diff --git a/src/liballoc/boxed.rs b/src/liballoc/boxed.rs index 8a3950718d7d2..831856726dffc 100644 --- a/src/liballoc/boxed.rs +++ b/src/liballoc/boxed.rs @@ -78,6 +78,7 @@ use core::hash::{Hash, Hasher}; use core::iter::{Iterator, FromIterator, FusedIterator}; use core::marker::{Unpin, Unsize}; use core::mem; +use core::needle::Needle; use core::pin::Pin; use core::ops::{ CoerceUnsized, DispatchFromDyn, Deref, DerefMut, Receiver, Generator, GeneratorState @@ -919,3 +920,35 @@ impl<F: ?Sized + Future + Unpin> Future for Box<F> { F::poll(Pin::new(&mut *self), cx) } } + +#[unstable(feature = "needle", issue = "56345")] +impl<'p, 'h, T: PartialEq + 'p + 'h> Needle<&'h [T]> for &'p Box<[T]> { + type Searcher = <&'p [T] as Needle<&'h [T]>>::Searcher; + type Consumer = <&'p [T] as Needle<&'h [T]>>::Consumer; + + #[inline] + fn into_searcher(self) -> Self::Searcher { + <&'p [T] as Needle<&'h [T]>>::into_searcher(&**self) + } + + #[inline] + fn into_consumer(self) -> Self::Consumer { + <&'p [T] as Needle<&'h [T]>>::into_consumer(&**self) + } +} + +#[unstable(feature = "needle", issue = "56345")] +impl<'p, 'h, T: PartialEq + 'p + 'h> Needle<&'h mut [T]> for &'p Box<[T]> { + type Searcher = <&'p [T] as Needle<&'h mut [T]>>::Searcher; + type Consumer = <&'p [T] as Needle<&'h mut [T]>>::Consumer; + + #[inline] + fn into_searcher(self) -> Self::Searcher { + <&'p [T] as Needle<&'h mut [T]>>::into_searcher(&**self) + } + + #[inline] + fn into_consumer(self) -> Self::Consumer { + <&'p [T] as Needle<&'h mut [T]>>::into_consumer(&**self) + } +} diff --git a/src/liballoc/lib.rs b/src/liballoc/lib.rs index 63b3fbbdaefe1..20d132c897448 100644 --- a/src/liballoc/lib.rs +++ b/src/liballoc/lib.rs @@ -91,7 +91,7 @@ #![feature(needs_allocator)] #![feature(nll)] #![feature(optin_builtin_traits)] -#![feature(pattern)] +#![feature(needle)] #![feature(ptr_internals)] #![feature(ptr_offset_from)] #![feature(rustc_attrs)] diff --git a/src/liballoc/slice.rs b/src/liballoc/slice.rs index 
6eac848740106..68d1f6cc8b787 100644 --- a/src/liballoc/slice.rs +++ b/src/liballoc/slice.rs @@ -92,6 +92,7 @@ use core::cmp::Ordering::{self, Less}; use core::mem::{self, size_of}; use core::ptr; use core::{u8, u16, u32}; +use core::needle::{ext, Needle, Searcher, Consumer}; use crate::borrow::ToOwned; use crate::boxed::Box; @@ -485,6 +486,40 @@ impl<T> [T] { } buf } + + /// Replaces all matches of a predicate with another slice. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + #[must_use = "this returns the replaced slice as a new allocation, \ + without modifying the original"] + pub fn replace<'s: 'a, 'a, F>(&'s self, from: F, to: &'a [T]) -> Vec<T> + where + T: Clone, + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + let mut result = Vec::with_capacity(self.len()); + ext::replace_with(self, from, |_| to, |s| result.extend_from_slice(s)); + result + } + + /// Replaces first N matches of a predicate with another slice. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + #[must_use = "this returns the replaced slice as a new allocation, \ + without modifying the original"] + pub fn replacen<'s: 'a, 'a, F>(&'s self, from: F, to: &'a [T], count: usize) -> Vec<T> + where + T: Clone, + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + let mut result = Vec::with_capacity(self.len()); + ext::replacen_with(self, from, |_| to, count, |s| result.extend_from_slice(s)); + result + } } #[lang = "slice_u8_alloc"] diff --git a/src/liballoc/str.rs b/src/liballoc/str.rs index f10a01d44c8ee..299474a89f287 100644 --- a/src/liballoc/str.rs +++ b/src/liballoc/str.rs @@ -28,14 +28,14 @@ // It's cleaner to just turn off the unused_imports warning than to fix them. 
#![allow(unused_imports)] -use core::borrow::Borrow; -use core::str::pattern::{Pattern, Searcher, ReverseSearcher, DoubleEndedSearcher}; +use core::fmt; +use core::needle::{ext, Needle, Searcher, Consumer}; use core::mem; use core::ptr; use core::iter::FusedIterator; use core::unicode::conversions; -use crate::borrow::ToOwned; +use crate::borrow::{Borrow, ToOwned}; use crate::boxed::Box; use crate::slice::{SliceConcatExt, SliceIndex}; use crate::string::String; @@ -62,8 +62,6 @@ pub use core::str::{from_utf8, from_utf8_mut, Chars, CharIndices, Bytes}; pub use core::str::{from_utf8_unchecked, from_utf8_unchecked_mut, ParseBoolError}; #[stable(feature = "rust1", since = "1.0.0")] pub use core::str::SplitWhitespace; -#[stable(feature = "rust1", since = "1.0.0")] -pub use core::str::pattern; #[stable(feature = "encode_utf16", since = "1.8.0")] pub use core::str::EncodeUtf16; #[stable(feature = "split_ascii_whitespace", since = "1.34.0")] @@ -255,15 +253,14 @@ impl str { without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn replace<'a, P: Pattern<'a>>(&'a self, from: P, to: &str) -> String { - let mut result = String::new(); - let mut last_end = 0; - for (start, part) in self.match_indices(from) { - result.push_str(unsafe { self.get_unchecked(last_end..start) }); - result.push_str(to); - last_end = start + part.len(); - } - result.push_str(unsafe { self.get_unchecked(last_end..self.len()) }); + pub fn replace<'s: 'a, 'a, P>(&'s self, from: P, to: &'a str) -> String + where + P: Needle<&'a str>, + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + let mut result = String::with_capacity(self.len()); + ext::replace_with(self, from, |_| to, |s| result.push_str(s)); result } @@ -295,16 +292,15 @@ impl str { #[must_use = "this returns the replaced string as a new allocation, \ without modifying the original"] #[stable(feature = "str_replacen", since = "1.16.0")] - pub fn 
replacen<'a, P: Pattern<'a>>(&'a self, pat: P, to: &str, count: usize) -> String { + pub fn replacen<'s: 'a, 'a, P>(&'s self, pat: P, to: &'a str, count: usize) -> String + where + P: Needle<&'a str>, + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { // Hope to reduce the times of re-allocation - let mut result = String::with_capacity(32); - let mut last_end = 0; - for (start, part) in self.match_indices(pat).take(count) { - result.push_str(unsafe { self.get_unchecked(last_end..start) }); - result.push_str(to); - last_end = start + part.len(); - } - result.push_str(unsafe { self.get_unchecked(last_end..self.len()) }); + let mut result = String::with_capacity(self.len()); + ext::replacen_with(self, pat, |_| to, count, |s| result.push_str(s)); result } diff --git a/src/liballoc/string.rs b/src/liballoc/string.rs index a3e2098695f70..44972d0be3123 100644 --- a/src/liballoc/string.rs +++ b/src/liballoc/string.rs @@ -53,7 +53,8 @@ use core::iter::{FromIterator, FusedIterator}; use core::ops::{self, Add, AddAssign, Index, IndexMut, RangeBounds}; use core::ops::Bound::{Excluded, Included, Unbounded}; use core::ptr; -use core::str::{pattern::Pattern, lossy}; +use core::needle::Needle; +use core::str::lossy; use crate::borrow::{Cow, ToOwned}; use crate::collections::CollectionAllocErr; @@ -1792,24 +1793,19 @@ impl<'a> Extend<Cow<'a, str>> for String { } /// A convenience impl that delegates to the impl for `&str` -#[unstable(feature = "pattern", - reason = "API not fully fleshed out and ready to be stabilized", - issue = "27721")] -impl<'a, 'b> Pattern<'a> for &'b String { - type Searcher = <&'b str as Pattern<'a>>::Searcher; - - fn into_searcher(self, haystack: &'a str) -> <&'b str as Pattern<'a>>::Searcher { - self[..].into_searcher(haystack) - } +#[unstable(feature = "needle", issue = "56345")] +impl<'a, 'b> Needle<&'a str> for &'b String { + type Searcher = <&'b str as Needle<&'a str>>::Searcher; + type Consumer = <&'b 
str as Needle<&'a str>>::Consumer; #[inline] - fn is_contained_in(self, haystack: &'a str) -> bool { - self[..].is_contained_in(haystack) + fn into_searcher(self) -> Self::Searcher { + <&'b str as Needle<&'a str>>::into_searcher(&**self) } #[inline] - fn is_prefix_of(self, haystack: &'a str) -> bool { - self[..].is_prefix_of(haystack) + fn into_consumer(self) -> Self::Consumer { + <&'b str as Needle<&'a str>>::into_consumer(&**self) } } diff --git a/src/liballoc/tests/lib.rs b/src/liballoc/tests/lib.rs index 90921b6af9f34..bb3f399fc254b 100644 --- a/src/liballoc/tests/lib.rs +++ b/src/liballoc/tests/lib.rs @@ -2,11 +2,13 @@ #![feature(box_syntax)] #![feature(drain_filter)] #![feature(exact_size_is_empty)] -#![feature(pattern)] #![feature(repeat_generic_slice)] +#![feature(needle)] #![feature(try_reserve)] #![feature(unboxed_closures)] #![feature(vecdeque_rotate)] +#![feature(mut_str_needle_methods)] +#![feature(slice_needle_methods)] use std::hash::{Hash, Hasher}; use std::collections::hash_map::DefaultHasher; diff --git a/src/liballoc/tests/slice.rs b/src/liballoc/tests/slice.rs index ad2cd7c95eb8f..c2a7d6ed3ade4 100644 --- a/src/liballoc/tests/slice.rs +++ b/src/liballoc/tests/slice.rs @@ -5,6 +5,7 @@ use std::panic; use std::rc::Rc; use std::sync::atomic::{Ordering::Relaxed, AtomicUsize}; use std::thread; +use std::f64::NAN; use rand::{Rng, RngCore, thread_rng}; use rand::seq::SliceRandom; @@ -844,19 +845,19 @@ fn test_splitator() { let xs = &[1, 2, 3, 4, 5]; let splits: &[&[_]] = &[&[1], &[3], &[5]]; - assert_eq!(xs.split(|x| *x % 2 == 0).collect::<Vec<_>>(), splits); + assert_eq!(xs.split(|x: &i32| *x % 2 == 0).collect::<Vec<_>>(), splits); let splits: &[&[_]] = &[&[], &[2, 3, 4, 5]]; - assert_eq!(xs.split(|x| *x == 1).collect::<Vec<_>>(), splits); + assert_eq!(xs.split(|x: &i32| *x == 1).collect::<Vec<_>>(), splits); let splits: &[&[_]] = &[&[1, 2, 3, 4], &[]]; - assert_eq!(xs.split(|x| *x == 5).collect::<Vec<_>>(), splits); + assert_eq!(xs.split(|x: &i32| *x 
== 5).collect::<Vec<_>>(), splits); let splits: &[&[_]] = &[&[1, 2, 3, 4, 5]]; - assert_eq!(xs.split(|x| *x == 10).collect::<Vec<_>>(), splits); + assert_eq!(xs.split(|x: &i32| *x == 10).collect::<Vec<_>>(), splits); let splits: &[&[_]] = &[&[], &[], &[], &[], &[], &[]]; - assert_eq!(xs.split(|_| true).collect::<Vec<&[i32]>>(), splits); + assert_eq!(xs.split(|_: &i32| true).collect::<Vec<&[i32]>>(), splits); let xs: &[i32] = &[]; let splits: &[&[i32]] = &[&[]]; - assert_eq!(xs.split(|x| *x == 5).collect::<Vec<&[i32]>>(), splits); + assert_eq!(xs.split(|x: &i32| *x == 5).collect::<Vec<&[i32]>>(), splits); } #[test] @@ -864,15 +865,15 @@ fn test_splitnator() { let xs = &[1, 2, 3, 4, 5]; let splits: &[&[_]] = &[&[1, 2, 3, 4, 5]]; - assert_eq!(xs.splitn(1, |x| *x % 2 == 0).collect::<Vec<_>>(), splits); + assert_eq!(xs.splitn(1, |x: &i32| *x % 2 == 0).collect::<Vec<_>>(), splits); let splits: &[&[_]] = &[&[1], &[3, 4, 5]]; - assert_eq!(xs.splitn(2, |x| *x % 2 == 0).collect::<Vec<_>>(), splits); + assert_eq!(xs.splitn(2, |x: &i32| *x % 2 == 0).collect::<Vec<_>>(), splits); let splits: &[&[_]] = &[&[], &[], &[], &[4, 5]]; - assert_eq!(xs.splitn(4, |_| true).collect::<Vec<_>>(), splits); + assert_eq!(xs.splitn(4, |_: &i32| true).collect::<Vec<_>>(), splits); let xs: &[i32] = &[]; let splits: &[&[i32]] = &[&[]]; - assert_eq!(xs.splitn(2, |x| *x == 5).collect::<Vec<_>>(), splits); + assert_eq!(xs.splitn(2, |x: &i32| *x == 5).collect::<Vec<_>>(), splits); } #[test] @@ -880,17 +881,17 @@ fn test_splitnator_mut() { let xs = &mut [1, 2, 3, 4, 5]; let splits: &[&mut [_]] = &[&mut [1, 2, 3, 4, 5]]; - assert_eq!(xs.splitn_mut(1, |x| *x % 2 == 0).collect::<Vec<_>>(), + assert_eq!(xs.splitn_mut(1, |x: &i32| *x % 2 == 0).collect::<Vec<_>>(), splits); let splits: &[&mut [_]] = &[&mut [1], &mut [3, 4, 5]]; - assert_eq!(xs.splitn_mut(2, |x| *x % 2 == 0).collect::<Vec<_>>(), + assert_eq!(xs.splitn_mut(2, |x: &i32| *x % 2 == 0).collect::<Vec<_>>(), splits); let splits: &[&mut [_]] = &[&mut 
[], &mut [], &mut [], &mut [4, 5]]; - assert_eq!(xs.splitn_mut(4, |_| true).collect::<Vec<_>>(), splits); + assert_eq!(xs.splitn_mut(4, |_: &i32| true).collect::<Vec<_>>(), splits); let xs: &mut [i32] = &mut []; let splits: &[&mut [i32]] = &[&mut []]; - assert_eq!(xs.splitn_mut(2, |x| *x == 5).collect::<Vec<_>>(), splits); + assert_eq!(xs.splitn_mut(2, |x: &i32| *x == 5).collect::<Vec<_>>(), splits); } #[test] @@ -898,17 +899,17 @@ fn test_rsplitator() { let xs = &[1, 2, 3, 4, 5]; let splits: &[&[_]] = &[&[5], &[3], &[1]]; - assert_eq!(xs.split(|x| *x % 2 == 0).rev().collect::<Vec<_>>(), splits); + assert_eq!(xs.split(|x: &i32| *x % 2 == 0).rev().collect::<Vec<_>>(), splits); let splits: &[&[_]] = &[&[2, 3, 4, 5], &[]]; - assert_eq!(xs.split(|x| *x == 1).rev().collect::<Vec<_>>(), splits); + assert_eq!(xs.split(|x: &i32| *x == 1).rev().collect::<Vec<_>>(), splits); let splits: &[&[_]] = &[&[], &[1, 2, 3, 4]]; - assert_eq!(xs.split(|x| *x == 5).rev().collect::<Vec<_>>(), splits); + assert_eq!(xs.split(|x: &i32| *x == 5).rev().collect::<Vec<_>>(), splits); let splits: &[&[_]] = &[&[1, 2, 3, 4, 5]]; - assert_eq!(xs.split(|x| *x == 10).rev().collect::<Vec<_>>(), splits); + assert_eq!(xs.split(|x: &i32| *x == 10).rev().collect::<Vec<_>>(), splits); let xs: &[i32] = &[]; let splits: &[&[i32]] = &[&[]]; - assert_eq!(xs.split(|x| *x == 5).rev().collect::<Vec<&[i32]>>(), splits); + assert_eq!(xs.split(|x: &i32| *x == 5).rev().collect::<Vec<&[i32]>>(), splits); } #[test] @@ -916,16 +917,16 @@ fn test_rsplitnator() { let xs = &[1, 2, 3, 4, 5]; let splits: &[&[_]] = &[&[1, 2, 3, 4, 5]]; - assert_eq!(xs.rsplitn(1, |x| *x % 2 == 0).collect::<Vec<_>>(), splits); + assert_eq!(xs.rsplitn(1, |x: &i32| *x % 2 == 0).collect::<Vec<_>>(), splits); let splits: &[&[_]] = &[&[5], &[1, 2, 3]]; - assert_eq!(xs.rsplitn(2, |x| *x % 2 == 0).collect::<Vec<_>>(), splits); + assert_eq!(xs.rsplitn(2, |x: &i32| *x % 2 == 0).collect::<Vec<_>>(), splits); let splits: &[&[_]] = &[&[], &[], &[], &[1, 
2]]; - assert_eq!(xs.rsplitn(4, |_| true).collect::<Vec<_>>(), splits); + assert_eq!(xs.rsplitn(4, |_: &i32| true).collect::<Vec<_>>(), splits); let xs: &[i32] = &[]; let splits: &[&[i32]] = &[&[]]; - assert_eq!(xs.rsplitn(2, |x| *x == 5).collect::<Vec<&[i32]>>(), splits); - assert!(xs.rsplitn(0, |x| *x % 2 == 0).next().is_none()); + assert_eq!(xs.rsplitn(2, |x: &i32| *x == 5).collect::<Vec<&[i32]>>(), splits); + assert!(xs.rsplitn(0, |x: &i32| *x % 2 == 0).next().is_none()); } #[test] @@ -1218,14 +1219,14 @@ fn test_ends_with() { #[test] fn test_mut_splitator() { let mut xs = [0, 1, 0, 2, 3, 0, 0, 4, 5, 0]; - assert_eq!(xs.split_mut(|x| *x == 0).count(), 6); - for slice in xs.split_mut(|x| *x == 0) { + assert_eq!(xs.split_mut(|x: &i32| *x == 0).count(), 6); + for slice in xs.split_mut(|x: &i32| *x == 0) { slice.reverse(); } assert!(xs == [0, 1, 0, 3, 2, 0, 0, 5, 4, 0]); let mut xs = [0, 1, 0, 2, 3, 0, 0, 4, 5, 0, 6, 7]; - for slice in xs.split_mut(|x| *x == 0).take(5) { + for slice in xs.split_mut(|x: &i32| *x == 0).take(5) { slice.reverse(); } assert!(xs == [0, 1, 0, 3, 2, 0, 0, 5, 4, 0, 6, 7]); @@ -1234,7 +1235,7 @@ fn test_mut_splitator() { #[test] fn test_mut_splitator_rev() { let mut xs = [1, 2, 0, 3, 4, 0, 0, 5, 6, 0]; - for slice in xs.split_mut(|x| *x == 0).rev().take(4) { + for slice in xs.split_mut(|x: &i32| *x == 0).rev().take(4) { slice.reverse(); } assert!(xs == [1, 2, 0, 4, 3, 0, 0, 6, 5, 0]); @@ -1655,3 +1656,63 @@ fn repeat_generic_slice() { vec![1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4] ); } + +#[test] +fn test_match_indices_simple() { + let haystack = &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 2.0, 3.0, 2.0, 4.0, 8.0][..]; + let needle = &[2.0, 3.0][..]; + + assert_eq!(haystack.match_indices(needle).collect::<Vec<_>>(), vec![ + (1, needle), + (8, needle), + ]); +} + +#[test] +fn test_match_indices_nan_haystack() { + let haystack = &[1.0, 2.0, NAN, 1.0, 2.0, NAN, 1.0, NAN, NAN, NAN, 2.0, 1.0, 2.0][..]; + let needle = &[1.0, 2.0][..]; + + 
assert_eq!(haystack.match_indices(needle).collect::<Vec<_>>(), vec![ + (0, needle), + (3, needle), + (11, needle), + ]); +} + +#[test] +fn test_match_indices_nan_needle() { + let haystack = &[1.0, 2.0, NAN, 1.0, 2.0, NAN, 1.0, NAN, NAN, NAN, 2.0, 1.0, 2.0][..]; + let needle = &[2.0, NAN][..]; + + assert_eq!(haystack.match_indices(needle).collect::<Vec<_>>(), vec![ + ]); +} + +#[test] +fn test_match_indices_negative_zero() { + let haystack = &[-0.0, 0.0, 0.0, -0.0, 0.0][..]; + let needle = &[0.0, -0.0][..]; + + assert_eq!(haystack.match_indices(needle).collect::<Vec<_>>(), vec![ + (0, needle), + (2, needle), + ]); +} + +#[test] +fn test_replace() { + let haystack = &b" empowering everyone to build reliable and efficient software."[..]; + + assert_eq!( + haystack.replace(&b" e"[..], b" **E**"), + b" **E**mpowering **E**veryone to build reliable and **E**fficient software.".to_vec() + ); +} + +#[test] +fn test_boxed_slice_ref_is_a_needle() { + let boundary = vec![b'x'; 12].into_boxed_slice(); + let bytes: &[u8] = b"--xxxxxxxxxxxxyyyy"; + assert!(bytes[2..].starts_with(&boundary)); +} diff --git a/src/liballoc/tests/str.rs b/src/liballoc/tests/str.rs index b197516403f78..89d245950c359 100644 --- a/src/liballoc/tests/str.rs +++ b/src/liballoc/tests/str.rs @@ -1606,145 +1606,6 @@ fn test_repeat() { assert_eq!("α".repeat(3), "ααα"); } -mod pattern { - use std::str::pattern::{Pattern, Searcher, ReverseSearcher}; - use std::str::pattern::SearchStep::{self, Match, Reject, Done}; - - macro_rules! 
make_test { - ($name:ident, $p:expr, $h:expr, [$($e:expr,)*]) => { - #[allow(unused_imports)] - mod $name { - use std::str::pattern::SearchStep::{Match, Reject}; - use super::{cmp_search_to_vec}; - #[test] - fn fwd() { - cmp_search_to_vec(false, $p, $h, vec![$($e),*]); - } - #[test] - fn bwd() { - cmp_search_to_vec(true, $p, $h, vec![$($e),*]); - } - } - } - } - - fn cmp_search_to_vec<'a, P: Pattern<'a>>(rev: bool, pat: P, haystack: &'a str, - right: Vec<SearchStep>) - where P::Searcher: ReverseSearcher<'a> - { - let mut searcher = pat.into_searcher(haystack); - let mut v = vec![]; - loop { - match if !rev {searcher.next()} else {searcher.next_back()} { - Match(a, b) => v.push(Match(a, b)), - Reject(a, b) => v.push(Reject(a, b)), - Done => break, - } - } - if rev { - v.reverse(); - } - - let mut first_index = 0; - let mut err = None; - - for (i, e) in right.iter().enumerate() { - match *e { - Match(a, b) | Reject(a, b) - if a <= b && a == first_index => { - first_index = b; - } - _ => { - err = Some(i); - break; - } - } - } - - if let Some(err) = err { - panic!("Input skipped range at {}", err); - } - - if first_index != haystack.len() { - panic!("Did not cover whole input"); - } - - assert_eq!(v, right); - } - - make_test!(str_searcher_ascii_haystack, "bb", "abbcbbd", [ - Reject(0, 1), - Match (1, 3), - Reject(3, 4), - Match (4, 6), - Reject(6, 7), - ]); - make_test!(str_searcher_ascii_haystack_seq, "bb", "abbcbbbbd", [ - Reject(0, 1), - Match (1, 3), - Reject(3, 4), - Match (4, 6), - Match (6, 8), - Reject(8, 9), - ]); - make_test!(str_searcher_empty_needle_ascii_haystack, "", "abbcbbd", [ - Match (0, 0), - Reject(0, 1), - Match (1, 1), - Reject(1, 2), - Match (2, 2), - Reject(2, 3), - Match (3, 3), - Reject(3, 4), - Match (4, 4), - Reject(4, 5), - Match (5, 5), - Reject(5, 6), - Match (6, 6), - Reject(6, 7), - Match (7, 7), - ]); - make_test!(str_searcher_multibyte_haystack, " ", "├──", [ - Reject(0, 3), - Reject(3, 6), - Reject(6, 9), - ]); - 
make_test!(str_searcher_empty_needle_multibyte_haystack, "", "├──", [ - Match (0, 0), - Reject(0, 3), - Match (3, 3), - Reject(3, 6), - Match (6, 6), - Reject(6, 9), - Match (9, 9), - ]); - make_test!(str_searcher_empty_needle_empty_haystack, "", "", [ - Match(0, 0), - ]); - make_test!(str_searcher_nonempty_needle_empty_haystack, "├", "", [ - ]); - make_test!(char_searcher_ascii_haystack, 'b', "abbcbbd", [ - Reject(0, 1), - Match (1, 2), - Match (2, 3), - Reject(3, 4), - Match (4, 5), - Match (5, 6), - Reject(6, 7), - ]); - make_test!(char_searcher_multibyte_haystack, ' ', "├──", [ - Reject(0, 3), - Reject(3, 6), - Reject(6, 9), - ]); - make_test!(char_searcher_short_haystack, '\u{1F4A9}', "* \t", [ - Reject(0, 1), - Reject(1, 2), - Reject(2, 3), - ]); - -} - macro_rules! generate_iterator_test { { $name:ident { @@ -1837,13 +1698,51 @@ generate_iterator_test! { #[test] fn different_str_pattern_forwarding_lifetimes() { - use std::str::pattern::Pattern; + // FIXME: The generic form (see PR 31989) causes a strange "trait bound not satisfied" error. + // revisit this after RFC 2089 is implemented. 
- fn foo<'a, P>(p: P) where for<'b> &'b P: Pattern<'a> { + fn foo(p: &str) { for _ in 0..3 { "asdf".find(&p); } } - foo::<&str>("x"); + foo("x"); +} + +#[test] +fn test_mut_str() { + use std::ops::Range; + + let mut s = String::from("a1b2c3d4e"); + { + let res: &mut str = s.trim_matches_mut(|c: char| c.is_ascii_alphabetic()); + assert_eq!(res, "1b2c3d4"); + } + { + let res: Vec<&mut str> = s.split_mut(|c: char| c.is_ascii_digit()).collect(); + assert_eq!(res, vec!["a", "b", "c", "d", "e"]); + } + { + let res: Vec<(Range<usize>, &mut str)> = + s.match_ranges_mut(|c: char| c.is_ascii_digit()).collect(); + let res = res.into_iter().map(|(r, ss)| (r, &*ss)).collect::<Vec<_>>(); + assert_eq!(res, vec![ + (1..2, "1"), + (3..4, "2"), + (5..6, "3"), + (7..8, "4"), + ]); + } + { + let res: Vec<(Range<usize>, &mut str)> = + s.rmatch_ranges_mut(|c: char| c.is_ascii_digit()).collect(); + let res = res.into_iter().map(|(r, ss)| (r, &*ss)).collect::<Vec<_>>(); + assert_eq!(res, vec![ + (7..8, "4"), + (5..6, "3"), + (3..4, "2"), + (1..2, "1"), + ]); + } } diff --git a/src/liballoc/vec.rs b/src/liballoc/vec.rs index cd62c3e05244c..1f826da6b064b 100644 --- a/src/liballoc/vec.rs +++ b/src/liballoc/vec.rs @@ -67,6 +67,7 @@ use core::ops::{self, Index, IndexMut, RangeBounds}; use core::ops::Bound::{Excluded, Included, Unbounded}; use core::ptr::{self, NonNull}; use core::slice::{self, SliceIndex}; +use core::needle::Needle; use crate::borrow::{ToOwned, Cow}; use crate::collections::CollectionAllocErr; @@ -2724,3 +2725,35 @@ impl<T, F> Drop for DrainFilter<'_, T, F> } } } + +#[unstable(feature = "needle", issue = "56345")] +impl<'p, 'h, T: PartialEq + 'p + 'h> Needle<&'h [T]> for &'p Vec<T> { + type Searcher = <&'p [T] as Needle<&'h [T]>>::Searcher; + type Consumer = <&'p [T] as Needle<&'h [T]>>::Consumer; + + #[inline] + fn into_searcher(self) -> Self::Searcher { + <&'p [T] as Needle<&'h [T]>>::into_searcher(&**self) + } + + #[inline] + fn into_consumer(self) -> Self::Consumer { + 
<&'p [T] as Needle<&'h [T]>>::into_consumer(&**self) + } +} + +#[unstable(feature = "needle", issue = "56345")] +impl<'p, 'h, T: PartialEq + 'p + 'h> Needle<&'h mut [T]> for &'p Vec<T> { + type Searcher = <&'p [T] as Needle<&'h mut [T]>>::Searcher; + type Consumer = <&'p [T] as Needle<&'h mut [T]>>::Consumer; + + #[inline] + fn into_searcher(self) -> Self::Searcher { + <&'p [T] as Needle<&'h mut [T]>>::into_searcher(&**self) + } + + #[inline] + fn into_consumer(self) -> Self::Consumer { + <&'p [T] as Needle<&'h mut [T]>>::into_consumer(&**self) + } +} diff --git a/src/libcore/lib.rs b/src/libcore/lib.rs index 28db55578c3de..2e6ee06206d13 100644 --- a/src/libcore/lib.rs +++ b/src/libcore/lib.rs @@ -211,6 +211,7 @@ pub mod fmt; pub mod time; pub mod unicode; +pub mod needle; /* Async */ pub mod future; diff --git a/src/libcore/needle/ext.rs b/src/libcore/needle/ext.rs new file mode 100644 index 0000000000000..774e870a6a016 --- /dev/null +++ b/src/libcore/needle/ext.rs @@ -0,0 +1,969 @@ +//! Extension functions which can be applied on any pairs of [`Haystack`]/[`Needle`]. + +use super::haystack::{Hay, Haystack, Span}; +use super::needle::{ + Needle, Searcher, ReverseSearcher, DoubleEndedSearcher, + Consumer, ReverseConsumer, DoubleEndedConsumer, +}; +use crate::iter::FusedIterator; +use crate::ops::Range; +use crate::fmt; + +macro_rules! 
generate_clone_and_debug { + ($name:ident, $field:tt) => { + impl<H, S> Clone for $name<H, S> + where + H: Haystack + Clone, + S: Clone, + H::Target: Hay, // FIXME: RFC 2089 or 2289 + { + fn clone(&self) -> Self { + $name { $field: self.$field.clone() } + } + fn clone_from(&mut self, src: &Self) { + self.$field.clone_from(&src.$field); + } + } + + impl<H, S> fmt::Debug for $name<H, S> + where + H: Haystack + fmt::Debug, + S: fmt::Debug, + H::Target: Hay, // FIXME: RFC 2089 or 2289 + { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple(stringify!($name)) + .field(&self.$field) + .finish() + } + } + } +} + +macro_rules! generate_pattern_iterators { + { + // Forward iterator + forward: + $(#[$forward_iterator_attribute:meta])* + struct $forward_iterator:ident; + + // Reverse iterator + reverse: + $(#[$reverse_iterator_attribute:meta])* + struct $reverse_iterator:ident; + + // Stability of all generated items + stability: + $(#[$common_stability_attribute:meta])* + + // Internal almost-iterator that is being delegated to + internal: + $internal_iterator:ident yielding ($iterty:ty); + + // Kind of delegation - either single ended or double ended + delegate $($t:tt)* + } => { + $(#[$forward_iterator_attribute])* + $(#[$common_stability_attribute])* + pub struct $forward_iterator<H, S>($internal_iterator<H, S>) + where + H::Target: Hay, // FIXME: RFC 2089 or 2289 + H: Haystack; + + generate_clone_and_debug!($forward_iterator, 0); + + $(#[$common_stability_attribute])* + impl<H, S> Iterator for $forward_iterator<H, S> + where + H: Haystack, + S: Searcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 + { + type Item = $iterty; + + #[inline] + fn next(&mut self) -> Option<Self::Item> { + self.0.next() + } + } + + $(#[$reverse_iterator_attribute])* + $(#[$common_stability_attribute])* + pub struct $reverse_iterator<H, S>($internal_iterator<H, S>) + where + H::Target: Hay, // FIXME: RFC 2089 or 2289 + H: Haystack; + + 
generate_clone_and_debug!($reverse_iterator, 0); + + $(#[$common_stability_attribute])* + impl<H, S> Iterator for $reverse_iterator<H, S> + where + H: Haystack, + S: ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 + { + type Item = $iterty; + + #[inline] + fn next(&mut self) -> Option<Self::Item> { + self.0.next_back() + } + } + + #[stable(feature = "fused", since = "1.26.0")] + impl<H, S> FusedIterator for $forward_iterator<H, S> + where + H: Haystack, + S: Searcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 + {} + + #[stable(feature = "fused", since = "1.26.0")] + impl<H, S> FusedIterator for $reverse_iterator<H, S> + where + H: Haystack, + S: ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 + {} + + generate_pattern_iterators!($($t)* with $(#[$common_stability_attribute])*, + $forward_iterator, + $reverse_iterator); + }; + { + double ended; with $(#[$common_stability_attribute:meta])*, + $forward_iterator:ident, + $reverse_iterator:ident + } => { + $(#[$common_stability_attribute])* + impl<H, S> DoubleEndedIterator for $forward_iterator<H, S> + where + H: Haystack, + S: DoubleEndedSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 + { + #[inline] + fn next_back(&mut self) -> Option<Self::Item> { + self.0.next_back() + } + } + + $(#[$common_stability_attribute])* + impl<H, S> DoubleEndedIterator for $reverse_iterator<H, S> + where + H: Haystack, + S: DoubleEndedSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 + { + #[inline] + fn next_back(&mut self) -> Option<Self::Item> { + self.0.next() + } + } + }; + { + single ended; with $(#[$common_stability_attribute:meta])*, + $forward_iterator:ident, + $reverse_iterator:ident + } => {} +} + +//------------------------------------------------------------------------------ +// Starts with / Ends with +//------------------------------------------------------------------------------ + +/// Returns `true` if the given needle 
matches a prefix of the haystack. +/// +/// Returns `false` if it does not. +pub fn starts_with<H, P>(haystack: H, needle: P) -> bool +where + H: Haystack, + P: Needle<H>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + needle.into_consumer().consume((*haystack).into()).is_some() +} + +/// Returns `true` if the given needle matches a suffix of this haystack. +/// +/// Returns `false` if it does not. +#[inline] +pub fn ends_with<H, P>(haystack: H, needle: P) -> bool +where + H: Haystack, + P: Needle<H>, + P::Consumer: ReverseConsumer<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + needle.into_consumer().rconsume((*haystack).into()).is_some() +} + +//------------------------------------------------------------------------------ +// Trim +//------------------------------------------------------------------------------ + +/// Returns a haystack slice with all prefixes that match the needle repeatedly removed. +#[inline] +pub fn trim_start<H, P>(haystack: H, needle: P) -> H +where + H: Haystack, + P: Needle<H>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + let range = { + let hay = &*haystack; + let start = needle.into_consumer().trim_start(hay); + let end = hay.end_index(); + start..end + }; + unsafe { haystack.slice_unchecked(range) } +} + +/// Returns a haystack slice with all suffixes that match the needle repeatedly removed. +pub fn trim_end<H, P>(haystack: H, needle: P) -> H +where + H: Haystack, + P: Needle<H>, + P::Consumer: ReverseConsumer<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + let range = { + let hay = &*haystack; + let start = hay.start_index(); + let end = needle.into_consumer().trim_end(hay); + start..end + }; + unsafe { haystack.slice_unchecked(range) } +} + +/// Returns a haystack slice with all prefixes and suffixes that match the needle +/// repeatedly removed. 
+pub fn trim<H, P>(haystack: H, needle: P) -> H +where + H: Haystack, + P: Needle<H>, + P::Consumer: DoubleEndedConsumer<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + let mut checker = needle.into_consumer(); + let range = { + let hay = &*haystack; + let end = checker.trim_end(hay); + let hay = unsafe { Hay::slice_unchecked(hay, hay.start_index()..end) }; + let start = checker.trim_start(hay); + start..end + }; + unsafe { haystack.slice_unchecked(range) } +} + +//------------------------------------------------------------------------------ +// Matches +//------------------------------------------------------------------------------ + +#[derive(Debug, Clone)] +struct MatchesInternal<H, S> +where + H: Haystack, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + searcher: S, + rest: Span<H>, +} + +impl<H, S> MatchesInternal<H, S> +where + H: Haystack, + S: Searcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + #[inline] + fn next_spanned(&mut self) -> Option<Span<H>> { + let rest = self.rest.take(); + let range = self.searcher.search(rest.borrow())?; + let [_, middle, right] = unsafe { rest.split_around(range) }; + self.rest = right; + Some(middle) + } + + #[inline] + fn next(&mut self) -> Option<H> { + Some(Span::into(self.next_spanned()?)) + } +} + +impl<H, S> MatchesInternal<H, S> +where + H: Haystack, + S: ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + #[inline] + fn next_back_spanned(&mut self) -> Option<Span<H>> { + let rest = self.rest.take(); + let range = self.searcher.rsearch(rest.borrow())?; + let [left, middle, _] = unsafe { rest.split_around(range) }; + self.rest = left; + Some(middle) + } + + #[inline] + fn next_back(&mut self) -> Option<H> { + Some(Span::into(self.next_back_spanned()?)) + } +} + +generate_pattern_iterators! { + forward: + /// Created with the function [`matches`](fn.matches.html). + struct Matches; + reverse: + /// Created with the function [`rmatches`](fn.rmatches.html). 
+ struct RMatches; + stability: + internal: + MatchesInternal yielding (H); + delegate double ended; +} + +/// An iterator over the disjoint matches of the needle within the given haystack. +pub fn matches<H, P>(haystack: H, needle: P) -> Matches<H, P::Searcher> +where + H: Haystack, + P: Needle<H>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + Matches(MatchesInternal { + searcher: needle.into_searcher(), + rest: haystack.into(), + }) +} + +/// An iterator over the disjoint matches of the needle within the haystack, +/// yielded in reverse order. +pub fn rmatches<H, P>(haystack: H, needle: P) -> RMatches<H, P::Searcher> +where + H: Haystack, + P: Needle<H>, + P::Searcher: ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + RMatches(MatchesInternal { + searcher: needle.into_searcher(), + rest: haystack.into(), + }) +} + +/// Returns `true` if the given needle matches a sub-slice of the haystack. +/// +/// Returns `false` if it does not. +pub fn contains<H, P>(haystack: H, needle: P) -> bool +where + H: Haystack, + P: Needle<H>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + needle.into_searcher() + .search((*haystack).into()) + .is_some() +} + +//------------------------------------------------------------------------------ +// MatchIndices +//------------------------------------------------------------------------------ + +struct MatchIndicesInternal<H, S> +where + H: Haystack, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + inner: MatchesInternal<H, S>, +} + +generate_clone_and_debug!(MatchIndicesInternal, inner); + +impl<H, S> MatchIndicesInternal<H, S> +where + H: Haystack, + S: Searcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + #[inline] + fn next(&mut self) -> Option<(<H::Target as Hay>::Index, H)> { + let span = self.inner.next_spanned()?; + let index = span.original_range().start; + Some((index, Span::into(span))) + } +} + +impl<H, S> MatchIndicesInternal<H, S> +where + H: Haystack, + S: 
ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + #[inline] + fn next_back(&mut self) -> Option<(<H::Target as Hay>::Index, H)> { + let span = self.inner.next_back_spanned()?; + let index = span.original_range().start; + Some((index, Span::into(span))) + } +} + +generate_pattern_iterators! { + forward: + /// Created with the function [`match_indices`](fn.match_indices.html). + struct MatchIndices; + reverse: + /// Created with the function [`rmatch_indices`](fn.rmatch_indices.html). + struct RMatchIndices; + stability: + internal: + MatchIndicesInternal yielding ((<H::Target as Hay>::Index, H)); + delegate double ended; +} + +/// An iterator over the disjoint matches of a needle within the haystack +/// as well as the index that the match starts at. +/// +/// For matches of `needle` within `haystack` that overlap, +/// only the indices corresponding to the first match are returned. +pub fn match_indices<H, P>(haystack: H, needle: P) -> MatchIndices<H, P::Searcher> +where + H: Haystack, + P: Needle<H>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + MatchIndices(MatchIndicesInternal { + inner: matches(haystack, needle).0, + }) +} + +/// An iterator over the disjoint matches of a needle within the haystack, +/// yielded in reverse order along with the index of the match. +/// +/// For matches of `needle` within `haystack` that overlap, +/// only the indices corresponding to the last match are returned. +pub fn rmatch_indices<H, P>(haystack: H, needle: P) -> RMatchIndices<H, P::Searcher> +where + H: Haystack, + P: Needle<H>, + P::Searcher: ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + RMatchIndices(MatchIndicesInternal { + inner: rmatches(haystack, needle).0, + }) +} + +/// Returns the start index of first slice of the haystack that matches the needle. +/// +/// Returns [`None`] if the pattern doesn't match. 
+#[inline] +pub fn find<H, P>(haystack: H, needle: P) -> Option<<H::Target as Hay>::Index> +where + H: Haystack, + P: Needle<H>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + needle.into_searcher() + .search((*haystack).into()) + .map(|r| r.start) +} + +/// Returns the start index of last slice of the haystack that matches the needle. +/// +/// Returns [`None`] if the pattern doesn't match. +pub fn rfind<H, P>(haystack: H, needle: P) -> Option<<H::Target as Hay>::Index> +where + H: Haystack, + P: Needle<H>, + P::Searcher: ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + needle.into_searcher() + .rsearch((*haystack).into()) + .map(|r| r.start) +} + +//------------------------------------------------------------------------------ +// MatchRanges +//------------------------------------------------------------------------------ + +struct MatchRangesInternal<H, S> +where + H: Haystack, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + inner: MatchesInternal<H, S>, +} + +generate_clone_and_debug!(MatchRangesInternal, inner); + +impl<H, S> MatchRangesInternal<H, S> +where + H: Haystack, + S: Searcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + #[inline] + fn next(&mut self) -> Option<(Range<<H::Target as Hay>::Index>, H)> { + let span = self.inner.next_spanned()?; + let range = span.original_range(); + Some((range, Span::into(span))) + } +} + +impl<H, S> MatchRangesInternal<H, S> +where + H: Haystack, + S: ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + #[inline] + fn next_back(&mut self) -> Option<(Range<<H::Target as Hay>::Index>, H)> { + let span = self.inner.next_back_spanned()?; + let range = span.original_range(); + Some((range, Span::into(span))) + } +} + +generate_pattern_iterators! { + forward: + /// Created with the function [`match_ranges`](fn.match_ranges.html). + struct MatchRanges; + reverse: + /// Created with the function [`rmatch_ranges`](fn.rmatch_ranges.html). 
+ struct RMatchRanges; + stability: + internal: + MatchRangesInternal yielding ((Range<<H::Target as Hay>::Index>, H)); + delegate double ended; +} + +/// An iterator over the disjoint matches of a needle within the haystack +/// as well as the index ranges of each match. +/// +/// For matches of `needle` within `haystack` that overlap, +/// only the ranges corresponding to the first match are returned. +pub fn match_ranges<H, P>(haystack: H, needle: P) -> MatchRanges<H, P::Searcher> +where + H: Haystack, + P: Needle<H>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + MatchRanges(MatchRangesInternal { + inner: matches(haystack, needle).0, + }) +} + +/// An iterator over the disjoint matches of a needle within the haystack, +/// yielded in reverse order along with the index range of the match. +/// +/// For matches of `needle` within `haystack` that overlap, +/// only the ranges corresponding to the last match are returned. +pub fn rmatch_ranges<H, P>(haystack: H, needle: P) -> RMatchRanges<H, P::Searcher> +where + H: Haystack, + P: Needle<H>, + P::Searcher: ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + RMatchRanges(MatchRangesInternal { + inner: rmatches(haystack, needle).0, + }) +} + +/// Returns the index range of first slice of the haystack that matches the needle. +/// +/// Returns [`None`] if the pattern doesn't match. +pub fn find_range<H, P>(haystack: H, needle: P) -> Option<Range<<H::Target as Hay>::Index>> +where + H: Haystack, + P: Needle<H>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + needle.into_searcher() + .search((*haystack).into()) +} + +/// Returns the index range of last slice of the haystack that matches the needle. +/// +/// Returns [`None`] if the pattern doesn't match. 
+pub fn rfind_range<H, P>(haystack: H, needle: P) -> Option<Range<<H::Target as Hay>::Index>> +where + H: Haystack, + P: Needle<H>, + P::Searcher: ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + needle.into_searcher() + .rsearch((*haystack).into()) +} + +//------------------------------------------------------------------------------ +// Split +//------------------------------------------------------------------------------ + +#[derive(Debug, Clone)] +struct SplitInternal<H, S> +where + H: Haystack, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + searcher: S, + rest: Span<H>, + finished: bool, + allow_trailing_empty: bool, +} + +impl<H, S> SplitInternal<H, S> +where + H: Haystack, + S: Searcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + #[inline] + fn next(&mut self) -> Option<H> { + if self.finished { + return None; + } + + let mut rest = self.rest.take(); + match self.searcher.search(rest.borrow()) { + Some(subrange) => { + let [left, _, right] = unsafe { rest.split_around(subrange) }; + self.rest = right; + rest = left; + } + None => { + self.finished = true; + if !self.allow_trailing_empty && rest.is_empty() { + return None; + } + } + } + Some(Span::into(rest)) + } +} + +impl<H, S> SplitInternal<H, S> +where + H: Haystack, + S: ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + #[inline] + fn next_back(&mut self) -> Option<H> { + if self.finished { + return None; + } + + let rest = self.rest.take(); + let after = match self.searcher.rsearch(rest.borrow()) { + Some(range) => { + let [left, _, right] = unsafe { rest.split_around(range) }; + self.rest = left; + right + } + None => { + self.finished = true; + rest + } + }; + + if !self.allow_trailing_empty { + self.allow_trailing_empty = true; + if after.is_empty() { + return self.next_back(); + } + } + + Some(Span::into(after)) + } +} + +generate_pattern_iterators! { + forward: + /// Created with the function [`split`](fn.split.html). 
+ struct Split; + reverse: + /// Created with the function [`rsplit`](fn.rsplit.html). + struct RSplit; + stability: + internal: + SplitInternal yielding (H); + delegate double ended; +} + +generate_pattern_iterators! { + forward: + /// Created with the function [`split_terminator`](fn.split_terminator.html). + struct SplitTerminator; + reverse: + /// Created with the function [`rsplit_terminator`](fn.rsplit_terminator.html). + struct RSplitTerminator; + stability: + internal: + SplitInternal yielding (H); + delegate double ended; +} + +/// An iterator over slices of the haystack, separated by parts matched by the needle. +pub fn split<H, P>(haystack: H, needle: P) -> Split<H, P::Searcher> +where + H: Haystack, + P: Needle<H>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + Split(SplitInternal { + searcher: needle.into_searcher(), + rest: haystack.into(), + finished: false, + allow_trailing_empty: true, + }) +} + +/// An iterator over slices of the haystack, separated by parts matched by the needle +/// and yielded in reverse order. +pub fn rsplit<H, P>(haystack: H, needle: P) -> RSplit<H, P::Searcher> +where + H: Haystack, + P: Needle<H>, + P::Searcher: ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + RSplit(SplitInternal { + searcher: needle.into_searcher(), + rest: haystack.into(), + finished: false, + allow_trailing_empty: true, + }) +} + +/// An iterator over slices of the haystack, separated by parts matched by the needle. +/// +/// Equivalent to [`split`](fn.split.html), except that the trailing slice is skipped if empty. +/// +/// This method can be used for haystack data that is *terminated*, +/// rather than *separated* by a needle. 
+pub fn split_terminator<H, P>(haystack: H, needle: P) -> SplitTerminator<H, P::Searcher> +where + H: Haystack, + P: Needle<H>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + SplitTerminator(SplitInternal { + searcher: needle.into_searcher(), + rest: haystack.into(), + finished: false, + allow_trailing_empty: false, + }) +} + +/// An iterator over slices of the haystack, separated by parts matched by the needle +/// and yielded in reverse order. +/// +/// Equivalent to [`rsplit`](fn.rsplit.html), except that the trailing slice is skipped if empty. +/// +/// This method can be used for haystack data that is *terminated*, +/// rather than *separated* by a needle. +pub fn rsplit_terminator<H, P>(haystack: H, needle: P) -> RSplitTerminator<H, P::Searcher> +where + H: Haystack, + P: Needle<H>, + P::Searcher: ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + RSplitTerminator(SplitInternal { + searcher: needle.into_searcher(), + rest: haystack.into(), + finished: false, + allow_trailing_empty: false, + }) +} + +//------------------------------------------------------------------------------ +// SplitN +//------------------------------------------------------------------------------ + +#[derive(Clone, Debug)] +struct SplitNInternal<H, S> +where + H: Haystack, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + searcher: S, + rest: Span<H>, + n: usize, +} + +impl<H, S> SplitNInternal<H, S> +where + H: Haystack, + S: Searcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + #[inline] + fn next(&mut self) -> Option<H> { + let mut rest = self.rest.take(); + match self.n { + 0 => { + return None; + } + 1 => { + self.n = 0; + } + n => { + match self.searcher.search(rest.borrow()) { + Some(range) => { + let [left, _, right] = unsafe { rest.split_around(range) }; + self.n = n - 1; + self.rest = right; + rest = left; + } + None => { + self.n = 0; + } + } + } + } + Some(Span::into(rest)) + } +} + +impl<H, S> SplitNInternal<H, S> +where + H: 
Haystack, + S: ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + #[inline] + fn next_back(&mut self) -> Option<H> { + let mut rest = self.rest.take(); + match self.n { + 0 => { + return None; + } + 1 => { + self.n = 0; + } + n => { + match self.searcher.rsearch(rest.borrow()) { + Some(range) => { + let [left, _, right] = unsafe { rest.split_around(range) }; + self.n = n - 1; + self.rest = left; + rest = right; + } + None => { + self.n = 0; + } + } + } + } + Some(Span::into(rest)) + } +} + +generate_pattern_iterators! { + forward: + /// Created with the function [`splitn`](fn.splitn.html). + struct SplitN; + reverse: + /// Created with the function [`rsplitn`](fn.rsplitn.html). + struct RSplitN; + stability: + internal: + SplitNInternal yielding (H); + delegate single ended; +} + +/// An iterator over slices of the given haystack, separated by a needle, +/// restricted to returning at most `n` items. +/// +/// If `n` slices are returned, +/// the last slice (the `n`th slice) will contain the remainder of the haystack. +pub fn splitn<H, P>(haystack: H, n: usize, needle: P) -> SplitN<H, P::Searcher> +where + H: Haystack, + P: Needle<H>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + SplitN(SplitNInternal { + searcher: needle.into_searcher(), + rest: haystack.into(), + n, + }) +} + +/// An iterator over slices of the given haystack, separated by a needle, +/// starting from the end of the haystack, restricted to returning at most `n` items. +/// +/// If `n` slices are returned, +/// the last slice (the `n`th slice) will contain the remainder of the haystack. 
+pub fn rsplitn<H, P>(haystack: H, n: usize, needle: P) -> RSplitN<H, P::Searcher> +where + H: Haystack, + P: Needle<H>, + P::Searcher: ReverseSearcher<H::Target>, + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + RSplitN(SplitNInternal { + searcher: needle.into_searcher(), + rest: haystack.into(), + n, + }) +} + +//------------------------------------------------------------------------------ +// Replace +//------------------------------------------------------------------------------ + +/// Replaces all matches of a needle with another haystack. +pub fn replace_with<H, P, F, W>(src: H, from: P, mut replacer: F, mut writer: W) +where + H: Haystack, + P: Needle<H>, + F: FnMut(H) -> H, + W: FnMut(H), + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + let mut searcher = from.into_searcher(); + let mut src = Span::from(src); + while let Some(range) = searcher.search(src.borrow()) { + let [left, middle, right] = unsafe { src.split_around(range) }; + writer(Span::into(left)); + writer(replacer(Span::into(middle))); + src = right; + } + writer(Span::into(src)); +} + +/// Replaces first `n` matches of a needle with another haystack. 
+pub fn replacen_with<H, P, F, W>(src: H, from: P, mut replacer: F, mut n: usize, mut writer: W) +where + H: Haystack, + P: Needle<H>, + F: FnMut(H) -> H, + W: FnMut(H), + H::Target: Hay, // FIXME: RFC 2089 or 2289 +{ + let mut searcher = from.into_searcher(); + let mut src = Span::from(src); + loop { + if n == 0 { + break; + } + n -= 1; + if let Some(range) = searcher.search(src.borrow()) { + let [left, middle, right] = unsafe { src.split_around(range) }; + writer(Span::into(left)); + writer(replacer(Span::into(middle))); + src = right; + } else { + break; + } + } + writer(Span::into(src)); +} diff --git a/src/libcore/needle/haystack.rs b/src/libcore/needle/haystack.rs new file mode 100644 index 0000000000000..4de8d8583adee --- /dev/null +++ b/src/libcore/needle/haystack.rs @@ -0,0 +1,726 @@ +use crate::fmt::Debug; +use crate::ops::{Deref, Range}; + +/// Borrowed `Haystack`. +/// +/// Every [`Haystack`](trait.Haystack.html) type can be borrowed as references +/// to `Hay` types. This allows multiple similar types to share the same +/// implementation (e.g. the haystacks `&[T]` and `&mut [T]` both have the same +/// corresponding hay type `[T]`). +/// +/// In other words, a `Haystack` is a generalized reference to `Hay`. +/// `Hay`s are typically implemented on unsized slice types like `str` and `[T]`. +/// +/// # Safety +/// +/// This trait is unsafe as there are some unchecked requirements which the +/// implementor must uphold. Failing to meet these requirements would lead to +/// out-of-bounds access. The safety requirements are written in each member of +/// this trait. +pub unsafe trait Hay { + /// The index type of the haystack. Typically a `usize`. + /// + /// Splitting a hay must be sublinear using this index type. For instance, + /// if we implement `Hay` for a linked list, the index should not be an + /// integer offset (`usize`) as this would require O(n) time to chase the + /// pointer and find the split point. 
Instead, for a linked list we should + /// directly use the node pointer as the index. + /// + /// # Safety + /// + /// Valid indices of a single hay have a total order, even though this type does + /// not require an `Ord` bound — for instance, to order two linked list + /// cursors, we need to chase the links and see if they meet; this is slow + /// and not suitable for implementing `Ord`, but conceptually an ordering + /// can be defined on linked list cursors. + type Index: Copy + Debug + Eq; + + /// Creates an empty hay. + /// + /// # Safety + /// + /// An empty hay's start and end indices must be the same, e.g. + /// + /// ```rust + /// #![feature(needle)] + /// use std::needle::Hay; + /// + /// let empty = <str>::empty(); + /// assert_eq!(empty.start_index(), empty.end_index()); + /// ``` + /// + /// This also suggests that there is exactly one valid index for an empty + /// hay. + /// + /// There is no guarantee that two separate calls to `.empty()` will produce + /// the same hay reference. + fn empty<'a>() -> &'a Self; + + /// Obtains the index to the start of the hay. + /// + /// Usually this method returns `0`. + /// + /// # Safety + /// + /// Implementation must ensure that the start index of hay is the first + /// valid index, i.e. for all valid indices `i` of `self`, we have + /// `self.start_index() <= i`. + fn start_index(&self) -> Self::Index; + + /// Obtains the index to the end of the hay. + /// + /// Usually this method returns the length of the hay. + /// + /// # Safety + /// + /// Implementation must ensure that the end index of hay is the last valid + /// index, i.e. for all valid indices `i` of `self`, we have + /// `i <= self.end_index()`. + fn end_index(&self) -> Self::Index; + + /// Returns the next immediate index in this hay. + /// + /// # Safety + /// + /// The `index` must be a valid index, and also must not be equal to + /// `self.end_index()`. 
+ /// + /// Implementation must ensure that if `j = self.next_index(i)`, then `j` + /// is also a valid index satisfying `j > i`. + /// + /// # Examples + /// + /// ```rust + /// #![feature(needle)] + /// use std::needle::Hay; + /// + /// let sample = "A→😀"; + /// unsafe { + /// assert_eq!(sample.next_index(0), 1); + /// assert_eq!(sample.next_index(1), 4); + /// assert_eq!(sample.next_index(4), 8); + /// } + /// ``` + unsafe fn next_index(&self, index: Self::Index) -> Self::Index; + + /// Returns the previous immediate index in this hay. + /// + /// # Safety + /// + /// The `index` must be a valid index, and also must not be equal to + /// `self.start_index()`. + /// + /// Implementation must ensure that if `j = self.prev_index(i)`, then `j` + /// is also a valid index satisfying `j < i`. + /// + /// # Examples + /// + /// ```rust + /// #![feature(needle)] + /// use std::needle::Hay; + /// + /// let sample = "A→😀"; + /// unsafe { + /// assert_eq!(sample.prev_index(8), 4); + /// assert_eq!(sample.prev_index(4), 1); + /// assert_eq!(sample.prev_index(1), 0); + /// } + /// ``` + unsafe fn prev_index(&self, index: Self::Index) -> Self::Index; + + /// Obtains a child hay by slicing `self`. + /// + /// # Safety + /// + /// The two ends of the range must be valid indices. The start of the range + /// must not be after the end of the range (`range.start <= range.end`). + unsafe fn slice_unchecked(&self, range: Range<Self::Index>) -> &Self; +} + +/// Linear splittable structure. +/// +/// A `Haystack` is implemented for reference and collection types such as +/// `&str`, `&mut [T]` and `Vec<T>`. Every haystack can be borrowed as an +/// underlying representation called a [`Hay`](trait.Hay.html). +/// Multiple haystacks may share the same hay type, and thus share the same +/// implementation of string search algorithms. +/// +/// In other words, a `Haystack` is a generalized reference to `Hay`. 
+/// +/// # Safety +/// +/// This trait is unsafe as there are some unchecked requirements which the +/// implementor must uphold. Failing to meet these requirements would lead to +/// out-of-bounds access. The safety requirements are written in each member of +/// this trait. +pub unsafe trait Haystack: Deref + Sized where Self::Target: Hay { + /// Creates an empty haystack. + fn empty() -> Self; + + /// Splits the haystack into three slices around the given range. + /// + /// This method splits `self` into three non-overlapping parts: + /// + /// 1. Before the range (`self[..range.start]`), + /// 2. Inside the range (`self[range]`), and + /// 3. After the range (`self[range.end..]`) + /// + /// The returned array contains these three parts in order. + /// + /// # Safety + /// + /// Caller should ensure that the start and end indices of `range` are + /// valid indices for the haystack `self` with `range.start <= range.end`. + /// + /// If the haystack is a mutable reference (`&mut A`), implementation must + /// ensure that the 3 returned haystacks are truly non-overlapping in memory. + /// This is required to uphold the "Aliasing XOR Mutability" guarantee. If a + /// haystack cannot be physically split into non-overlapping parts (e.g. in + /// `OsStr`), then `&mut A` should not implement `Haystack` either. + /// + /// # Examples + /// + /// ```rust + /// #![feature(needle)] + /// use std::needle::Haystack; + /// + /// let haystack = &mut [0, 1, 2, 3, 4, 5, 6]; + /// let [left, middle, right] = unsafe { haystack.split_around(2..6) }; + /// assert_eq!(left, &mut [0, 1]); + /// assert_eq!(middle, &mut [2, 3, 4, 5]); + /// assert_eq!(right, &mut [6]); + /// ``` + unsafe fn split_around(self, range: Range<<Self::Target as Hay>::Index>) -> [Self; 3]; + + /// Subslices this haystack. + /// + /// # Safety + /// + /// The start and end indices of `range` must be valid indices for the + /// haystack `self` with `range.start <= range.end`. 
+ unsafe fn slice_unchecked(self, range: Range<<Self::Target as Hay>::Index>) -> Self { + let [_, middle, _] = self.split_around(range); + middle + } + + /// Transforms the range from being relative to self's parent to being + /// relative to the original haystack it was sliced from. + /// + /// Typically this method can be simply implemented as + /// + /// ```text + /// (original.start + parent.start)..(original.start + parent.end) + /// ``` + /// + /// If this haystack is a [`SharedHaystack`](trait.SharedHaystack.html), + /// this method should never be called, and calling it would cause an + /// unreachable panic. + /// + /// # Safety + /// + /// The `parent` range should be a valid range relative to a hay *a*, which + /// was used to slice out *self*: `self == &a[parent]`. + /// + /// Similarly, the `original` range should be a valid range relative to + /// another hay *b* used to slice out *a*: `a == &b[original]`. + /// + /// The distance of `parent` must be consistent with the length of `self`. + /// + /// This method should return a range which satisfies: + /// + /// ```text + /// self == &b[original][parent] == &b[range] + /// ``` + /// + /// Slicing can be destructive and *invalidates* some indices, in particular + /// for owned types with a pointer-like index, e.g. linked list. In this + /// case, one should derive an entirely new index range from `self`, e.g. + /// returning `self.start_index()..self.end_index()`. 
+ /// + /// # Examples + /// + /// ```rust + /// #![feature(needle)] + /// use std::needle::Haystack; + /// + /// let mut hay = *b"This is a sample haystack"; + /// let restored = { + /// let this = &mut hay[2..23][3..19]; + /// assert_eq!(b"is a sample hays", this); + /// this.restore_range(2..23, 3..19) + /// }; + /// assert_eq!(b"is a sample hays", &hay[restored]); + /// ``` + fn restore_range( + &self, + original: Range<<Self::Target as Hay>::Index>, + parent: Range<<Self::Target as Hay>::Index>, + ) -> Range<<Self::Target as Hay>::Index>; +} + +/// A [`Haystack`](trait.Haystack.html) which can be shared and cheaply cloned +/// (e.g. `&H`, `Rc<H>`). +/// +/// If a haystack implements this marker trait, during internal operations the +/// original haystack will be retained in full and cloned, rather than being +/// sliced and split. Being a shared haystack allows the searcher to see the +/// entire haystack, including the consumed portion. +pub trait SharedHaystack: Haystack + Clone +where Self::Target: Hay // FIXME: RFC 2089 or 2289 +{} + +/// The borrowing behavior differs between a (unique) haystack and a shared +/// haystack. We use *specialization* to distinguish between these behaviors: +/// +/// * When using `split_around()` and `slice_unchecked()` with a unique +/// haystack, the original haystack will be split or sliced accordingly +/// to maintain unique ownership. +/// * When using these functions with a shared haystack, the original haystack +/// will be cloned in full as that could provide more context into +/// searchers. +/// +/// This trait will never be public. 
+trait SpanBehavior: Haystack +where Self::Target: Hay // FIXME: RFC 2089 or 2289 +{ + fn take(&mut self) -> Self; + + fn from_span(span: Span<Self>) -> Self; + + unsafe fn split_around_for_span( + self, + subrange: Range<<Self::Target as Hay>::Index>, + ) -> [Self; 3]; + + unsafe fn slice_unchecked_for_span( + self, + subrange: Range<<Self::Target as Hay>::Index>, + ) -> Self; + + fn borrow_range( + &self, + range: Range<<Self::Target as Hay>::Index>, + ) -> Range<<Self::Target as Hay>::Index>; + + fn do_restore_range( + &self, + range: Range<<Self::Target as Hay>::Index>, + subrange: Range<<Self::Target as Hay>::Index>, + ) -> Range<<Self::Target as Hay>::Index>; +} + +impl<H: Haystack> SpanBehavior for H +where H::Target: Hay // FIXME: RFC 2089 or 2289 +{ + #[inline] + default fn take(&mut self) -> Self { + crate::mem::replace(self, Self::empty()) + } + + #[inline] + default fn from_span(span: Span<Self>) -> Self { + span.haystack + } + + #[inline] + default fn borrow_range( + &self, + _: Range<<Self::Target as Hay>::Index>, + ) -> Range<<Self::Target as Hay>::Index> { + self.start_index()..self.end_index() + } + + #[inline] + default fn do_restore_range( + &self, + range: Range<<Self::Target as Hay>::Index>, + subrange: Range<<Self::Target as Hay>::Index>, + ) -> Range<<Self::Target as Hay>::Index> { + self.restore_range(range, subrange) + } + + #[inline] + default unsafe fn split_around_for_span( + self, + subrange: Range<<Self::Target as Hay>::Index>, + ) -> [Self; 3] { + self.split_around(subrange) + } + + #[inline] + default unsafe fn slice_unchecked_for_span( + self, + subrange: Range<<Self::Target as Hay>::Index>, + ) -> Self { + self.slice_unchecked(subrange) + } +} + +impl<H: SharedHaystack> SpanBehavior for H +where H::Target: Hay // FIXME: RFC 2089 or 2289 +{ + #[inline] + fn take(&mut self) -> Self { + self.clone() + } + + #[inline] + fn from_span(span: Span<Self>) -> Self { + unsafe { + span.haystack.slice_unchecked(span.range) + } + } + + #[inline] 
+ fn borrow_range( + &self, + range: Range<<Self::Target as Hay>::Index>, + ) -> Range<<Self::Target as Hay>::Index> { + range + } + + #[inline] + fn do_restore_range( + &self, + _: Range<<Self::Target as Hay>::Index>, + subrange: Range<<Self::Target as Hay>::Index>, + ) -> Range<<Self::Target as Hay>::Index> { + subrange + } + + #[inline] + unsafe fn split_around_for_span(self, _: Range<<Self::Target as Hay>::Index>) -> [Self; 3] { + [self.clone(), self.clone(), self] + } + + #[inline] + unsafe fn slice_unchecked_for_span(self, _: Range<<Self::Target as Hay>::Index>) -> Self { + self + } +} + +/// A span is a haystack coupled with the original range where the haystack is found. +/// +/// It can be considered as a tuple `(H, Range<H::Target::Index>)` +/// where the range is guaranteed to be valid for the haystack. +/// +/// # Examples +/// +/// ``` +/// #![feature(needle)] +/// use std::needle::Span; +/// +/// let orig_str = "Hello世界"; +/// let orig_span = Span::<&str>::from(orig_str); +/// +/// // slice a span. +/// let span = unsafe { orig_span.slice_unchecked(3..8) }; +/// +/// // further slicing (note the range is relative to the original span) +/// let subspan = unsafe { span.slice_unchecked(4..8) }; +/// +/// // obtains the substring. 
+/// let substring: &str = subspan.into(); +/// assert_eq!(substring, "o世"); +/// ``` +/// +/// Visualizing the spans: +/// +/// ```text +/// +/// 0 1 2 3 4 5 6 7 8 9 10 11 +/// +---+---+---+---+---+---+---+---+---+---+---+ +/// | H | e | l | l | o | U+4E16 | U+754C | orig_str +/// +---+---+---+---+---+---+---+---+---+---+---+ +/// +/// ^___________________________________________^ orig_span = (orig_str, 0..11) +/// +/// ^___________________^ span = (orig_str, 3..8) +/// +/// ^_______________^ subspan = (orig_str, 4..8) +/// ``` +#[derive(Debug, Clone)] +pub struct Span<H: Haystack> +where H::Target: Hay // FIXME: RFC 2089 or 2289 +{ + haystack: H, + range: Range<<<H as Deref>::Target as Hay>::Index>, + //^ The `<H as Trait>` is to trick `#[derive]` not to generate + // the where bound for `H::Target`. +} + +/// Creates a span which covers the entire haystack. +impl<H: Haystack> From<H> for Span<H> +where H::Target: Hay // FIXME: RFC 2089 or 2289 +{ + #[inline] + fn from(haystack: H) -> Self { + let range = haystack.start_index()..haystack.end_index(); + Self { haystack, range } + } +} + +/// Slices the original haystack to the focused range. +impl<H: Haystack> From<Span<H>> for H +where H::Target: Hay // FIXME: RFC 2089 or 2289 +{ + #[inline] + fn from(span: Span<H>) -> Self { + H::from_span(span) + } +} + +impl<H: SharedHaystack> Span<H> +where H::Target: Hay // FIXME: RFC 2089 or 2289 +{ + /// Decomposes this span into the original haystack, and the range it focuses on. + #[inline] + pub fn into_parts(self) -> (H, Range<<H::Target as Hay>::Index>) { + (self.haystack, self.range) + } + + /// Creates a span from a haystack, and a range it should focus on. + /// + /// # Safety + /// + /// The `range` must be a valid range relative to `haystack`. + #[inline] + pub unsafe fn from_parts(haystack: H, range: Range<<H::Target as Hay>::Index>) -> Self { + Self { haystack, range } + } +} + +impl<'h> Span<&'h str> { + /// Reinterprets the string span as a byte-array span. 
+ #[inline] + pub fn as_bytes(self) -> Span<&'h [u8]> { + Span { + haystack: self.haystack.as_bytes(), + range: self.range, + } + } +} + +impl<H: Haystack> Span<H> +where H::Target: Hay // FIXME: RFC 2089 or 2289 +{ + /// The range of the span, relative to the ultimate original haystack it was sliced from. + #[inline] + pub fn original_range(&self) -> Range<<H::Target as Hay>::Index> { + self.range.clone() + } + + /// Borrows a shared span. + #[inline] + pub fn borrow(&self) -> Span<&H::Target> { + Span { + haystack: &*self.haystack, + range: self.haystack.borrow_range(self.range.clone()), + } + } + + /// Checks whether this span is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.range.start == self.range.end + } + + /// Returns this span by value, and replaces the original span by an empty + /// span. + #[inline] + pub fn take(&mut self) -> Self { + let haystack = self.haystack.take(); + let range = self.range.clone(); + self.range.end = self.range.start; + Span { haystack, range } + } + + /// Splits this span into three spans around the given range. + /// + /// # Safety + /// + /// `subrange` must be a valid range relative to `self.borrow()`. 
A safe + /// usage is like: + /// + /// ```rust + /// # #![feature(needle)] + /// # use std::needle::{Span, Needle, Searcher}; + /// # let span = Span::from("foo"); + /// # let mut searcher = <&str as Needle<&str>>::into_searcher("o"); + /// # (|| -> Option<()> { + /// let range = searcher.search(span.borrow())?; + /// let [left, middle, right] = unsafe { span.split_around(range) }; + /// # Some(()) })(); + /// ``` + #[inline] + pub unsafe fn split_around(self, subrange: Range<<H::Target as Hay>::Index>) -> [Self; 3] { + let self_range = self.haystack.borrow_range(self.range.clone()); + let [left, middle, right] = self.haystack.split_around_for_span(subrange.clone()); + + let left_range = left.do_restore_range(self.range.clone(),self_range.start..subrange.start); + let right_range = right.do_restore_range(self.range.clone(), subrange.end..self_range.end); + let middle_range = middle.do_restore_range(self.range, subrange); + + [ + Self { haystack: left, range: left_range }, + Self { haystack: middle, range: middle_range }, + Self { haystack: right, range: right_range }, + ] + } + + /// Slices this span to the given range. + /// + /// # Safety + /// + /// `subrange` must be a valid range relative to `self.borrow()`. 
+ #[inline] + pub unsafe fn slice_unchecked(self, subrange: Range<<H::Target as Hay>::Index>) -> Self { + let haystack = self.haystack.slice_unchecked_for_span(subrange.clone()); + let range = haystack.do_restore_range(self.range, subrange); + Self { haystack, range } + } +} + +unsafe impl<'a, A: Hay + ?Sized + 'a> Haystack for &'a A { + #[inline] + fn empty() -> Self { + A::empty() + } + + #[inline] + unsafe fn split_around(self, range: Range<A::Index>) -> [Self; 3] { + [ + self.slice_unchecked(self.start_index()..range.start), + self.slice_unchecked(range.clone()), + self.slice_unchecked(range.end..self.end_index()), + ] + } + + #[inline] + unsafe fn slice_unchecked(self, range: Range<A::Index>) -> Self { + A::slice_unchecked(self, range) + } + + #[inline] + fn restore_range(&self, _: Range<A::Index>, _: Range<A::Index>) -> Range<A::Index> { + unreachable!() + } +} + +impl<'a, A: Hay + ?Sized + 'a> SharedHaystack for &'a A {} + +unsafe impl Hay for str { + type Index = usize; + + #[inline] + fn empty<'a>() -> &'a Self { + "" + } + + #[inline] + fn start_index(&self) -> usize { + 0 + } + + #[inline] + fn end_index(&self) -> usize { + self.len() + } + + #[inline] + unsafe fn slice_unchecked(&self, range: Range<usize>) -> &Self { + self.get_unchecked(range) + } + + #[inline] + unsafe fn next_index(&self, index: Self::Index) -> Self::Index { + index + self.get_unchecked(index..).chars().next().unwrap().len_utf8() + } + + #[inline] + unsafe fn prev_index(&self, index: Self::Index) -> Self::Index { + index - self.get_unchecked(..index).chars().next_back().unwrap().len_utf8() + } +} + +unsafe impl<'h> Haystack for &'h mut str { + #[inline] + fn empty() -> &'h mut str { + Self::default() + } + + #[inline] + unsafe fn slice_unchecked(self, range: Range<usize>) -> Self { + self.get_unchecked_mut(range) + } + + #[inline] + unsafe fn split_around(self, range: Range<usize>) -> [Self; 3] { + let (haystack, right) = self.split_at_mut(range.end); + let (left, middle) = 
haystack.split_at_mut(range.start); + [left, middle, right] + } + + #[inline] + fn restore_range(&self, range: Range<usize>, subrange: Range<usize>) -> Range<usize> { + (subrange.start + range.start)..(subrange.end + range.start) + } +} + +unsafe impl<T> Hay for [T] { + type Index = usize; + + #[inline] + fn empty<'a>() -> &'a Self { + &[] + } + + #[inline] + fn start_index(&self) -> usize { + 0 + } + + #[inline] + fn end_index(&self) -> usize { + self.len() + } + + #[inline] + unsafe fn slice_unchecked(&self, range: Range<usize>) -> &Self { + self.get_unchecked(range) + } + + #[inline] + unsafe fn next_index(&self, index: Self::Index) -> Self::Index { + index + 1 + } + + #[inline] + unsafe fn prev_index(&self, index: Self::Index) -> Self::Index { + index - 1 + } +} + +unsafe impl<'h, T: 'h> Haystack for &'h mut [T] { + #[inline] + fn empty() -> Self { + &mut [] + } + + #[inline] + unsafe fn slice_unchecked(self, range: Range<usize>) -> Self { + self.get_unchecked_mut(range) + } + + #[inline] + unsafe fn split_around(self, range: Range<usize>) -> [Self; 3] { + let (haystack, right) = self.split_at_mut(range.end); + let (left, middle) = haystack.split_at_mut(range.start); + [left, middle, right] + } + + #[inline] + fn restore_range(&self, range: Range<usize>, subrange: Range<usize>) -> Range<usize> { + (subrange.start + range.start)..(subrange.end + range.start) + } +} diff --git a/src/libcore/needle/mod.rs b/src/libcore/needle/mod.rs new file mode 100644 index 0000000000000..92a6f21a4c04a --- /dev/null +++ b/src/libcore/needle/mod.rs @@ -0,0 +1,39 @@ +#![unstable(feature = "needle", issue = "56345")] + +//! The Needle API, support generalized searching on strings, arrays and more. +//! +//! This module provides traits to facilitate searching [`Needle`] in a [`Haystack`]. +//! +//! [`Needle`]: trait.Needle.html +//! [`Haystack`]: trait.Haystack.html +//! +//! Haystacks +//! ========= +//! +//! 
A *haystack* refers to any linear structure which can be split or sliced +//! into smaller, non-overlapping parts. Examples are strings and vectors. +//! +//! ```rust +//! let haystack: &str = "hello"; // a string slice (`&str`) is a haystack. +//! let (a, b) = haystack.split_at(4); // it can be split into two strings. +//! let c = &a[1..3]; // it can be sliced. +//! ``` +//! +//! The minimal haystack which cannot be further sliced is called a *codeword*. +//! For instance, the codeword of a string would be a UTF-8 sequence. A haystack +//! can therefore be viewed as a consecutive list of codewords. +//! +//! The boundary between codewords can be addressed using an *index*. The +//! numbers 1, 3 and 4 in the snippet above are sample indices of a string. An +//! index is usually a `usize`. +//! +//! An arbitrary number may point outside of a haystack, or in the interior of a +//! codeword. These indices are invalid. A *valid index* of a certain haystack +//! would only point to the boundaries. + +mod haystack; +mod needle; +pub mod ext; + +pub use self::haystack::*; +pub use self::needle::*; diff --git a/src/libcore/needle/needle.rs b/src/libcore/needle/needle.rs new file mode 100644 index 0000000000000..e0cf8a906d225 --- /dev/null +++ b/src/libcore/needle/needle.rs @@ -0,0 +1,582 @@ +use super::haystack::{Haystack, Hay, Span}; + +use crate::ops::Range; + +/// A searcher, for searching a [`Needle`](trait.Needle.html) from a +/// [`Hay`](trait.Hay.html). +/// +/// This trait provides methods for searching for non-overlapping matches of a +/// needle starting from the front (left) of a hay. +/// +/// # Safety +/// +/// This trait is marked unsafe because the range returned by its methods are +/// required to lie on valid codeword boundaries in the haystack. This enables +/// users of this trait to slice the haystack without additional runtime checks. +/// +/// # Examples +/// +/// Implement a searcher and consumer which matches `b"Aaaa"` from a byte string. 
+/// +/// ```rust +/// #![feature(needle)] +/// use std::needle::*; +/// use std::ops::Range; +/// +/// // The searcher for searching `b"Aaaa"`, using naive search. +/// // We are going to use this as a needle too. +/// struct Aaaa; +/// +/// unsafe impl Searcher<[u8]> for Aaaa { +/// // search for an `b"Aaaa"` in the middle of the string, returns its range. +/// fn search(&mut self, span: Span<&[u8]>) -> Option<Range<usize>> { +/// let (hay, range) = span.into_parts(); +/// +/// let start = range.start; +/// for (i, window) in hay[range].windows(4).enumerate() { +/// if *window == b"Aaaa"[..] { +/// // remember to include the range offset +/// return Some((start + i)..(start + i + 4)); +/// } +/// } +/// +/// None +/// } +/// } +/// +/// unsafe impl Consumer<[u8]> for Aaaa { +/// // checks if an `b"Aaaa"` is at the beginning of the string, returns the end index. +/// fn consume(&mut self, span: Span<&[u8]>) -> Option<usize> { +/// let (hay, range) = span.into_parts(); +/// let end = range.start.checked_add(4)?; +/// if end <= range.end && hay[range.start..end] == b"Aaaa"[..] { +/// Some(end) +/// } else { +/// None +/// } +/// } +/// } +/// +/// impl<H: Haystack<Target = [u8]>> Needle<H> for Aaaa { +/// type Searcher = Self; +/// type Consumer = Self; +/// fn into_searcher(self) -> Self { self } +/// fn into_consumer(self) -> Self { self } +/// } +/// +/// // test with some standard algorithms. +/// let haystack = &b"Aaaaa!!!Aaa!!!Aaaaaaaaa!!!"[..]; +/// assert_eq!( +/// ext::split(haystack, Aaaa).collect::<Vec<_>>(), +/// vec![ +/// &b""[..], +/// &b"a!!!Aaa!!!"[..], +/// &b"aaaaa!!!"[..], +/// ] +/// ); +/// assert_eq!( +/// ext::match_ranges(haystack, Aaaa).collect::<Vec<_>>(), +/// vec![ +/// (0..4, &b"Aaaa"[..]), +/// (14..18, &b"Aaaa"[..]), +/// ] +/// ); +/// assert_eq!( +/// ext::trim_start(haystack, Aaaa), +/// &b"a!!!Aaa!!!Aaaaaaaaa!!!"[..]
+/// ); +/// ``` +pub unsafe trait Searcher<A: Hay + ?Sized> { + /// Searches for the first range which the needle can be found in the span. + /// + /// This method is used to support the following standard algorithms: + /// + /// * [`matches`](ext/fn.matches.html) + /// * [`contains`](ext/fn.contains.html) + /// * [`match_indices`](ext/fn.match_indices.html) + /// * [`find`](ext/fn.find.html) + /// * [`match_ranges`](ext/fn.match_ranges.html) + /// * [`find_range`](ext/fn.find_range.html) + /// * [`split`](ext/fn.split.html) + /// * [`split_terminator`](ext/fn.split_terminator.html) + /// * [`splitn`](ext/fn.splitn.html) + /// * [`replace_with`](ext/fn.replace_with.html) + /// * [`replacen_with`](ext/fn.replacen_with.html) + /// + /// The hay and the restricted range for searching can be recovered by + /// calling `span`[`.into_parts()`](struct.Span.html#method.into_parts). + /// The range returned by this method should be relative to the hay and + /// must be contained within the restricted range from the span. + /// + /// If the needle is not found, this method should return `None`. + /// + /// The reason this method takes a `Span<&A>` instead of just `&A` is + /// because some needles need context information provided by + /// the position of the current slice and the content around the slice. + /// Regex components like the start-/end-of-text anchors `^`/`$` + /// and word boundary `\b` are primary examples. + /// + /// # Examples + /// + /// Search for the locations of a substring inside a string, using the + /// searcher primitive. + /// + /// ``` + /// #![feature(needle)] + /// use std::needle::{Searcher, Needle, Span}; + /// + /// let mut searcher = Needle::<&str>::into_searcher("::"); + /// let span = Span::from("lion::tiger::leopard"); + /// // ^ ^ ^ ^ + /// // string indices: 0 4 11 20 + /// + /// // found the first "::". + /// assert_eq!(searcher.search(span.clone()), Some(4..6)); + /// + /// // slice the span to skip the first match. 
+ /// let span = unsafe { span.slice_unchecked(6..20) }; + /// + /// // found the second "::". + /// assert_eq!(searcher.search(span.clone()), Some(11..13)); + /// + /// // should find nothing now. + /// let span = unsafe { span.slice_unchecked(13..20) }; + /// assert_eq!(searcher.search(span.clone()), None); + /// ``` + fn search(&mut self, span: Span<&A>) -> Option<Range<A::Index>>; +} + +/// A consumer, for searching a [`Needle`](trait.Needle.html) from a +/// [`Hay`](trait.Hay.html) anchored at the beginning. +/// +/// This trait provides methods for matching a needle anchored at the beginning +/// of a hay. +/// +/// See documentation of [`Searcher`](trait.Searcher.html) for an example. +/// +/// # Safety +/// +/// This trait is marked unsafe because the range returned by its methods are +/// required to lie on valid codeword boundaries in the haystack. This enables +/// users of this trait to slice the haystack without additional runtime checks. +pub unsafe trait Consumer<A: Hay + ?Sized> { + /// Checks if the needle can be found at the beginning of the span. + /// + /// This method is used to implement the standard algorithm + /// [`starts_with()`](ext/fn.starts_with.html) as well as providing + /// the default implementation for [`.trim_start()`](#method.trim_start). + /// + /// The hay and the restricted range for searching can be recovered by + /// calling `span`[`.into_parts()`](struct.Span.html#method.into_parts). + /// If a needle can be found starting at `range.start`, this method should + /// return the end index of the needle relative to the hay. + /// + /// If the needle cannot be found at the beginning of the span, this method + /// should return `None`. + /// + /// # Examples + /// + /// Consumes ASCII characters from the beginning.
+ /// + /// ``` + /// #![feature(needle)] + /// use std::needle::{Consumer, Needle, Span}; + /// + /// let mut consumer = Needle::<&str>::into_consumer(|c: char| c.is_ascii()); + /// let span = Span::from("Hi😋!!"); + /// + /// // consumes the first ASCII character + /// assert_eq!(consumer.consume(span.clone()), Some(1)); + /// + /// // slice the span to skip the first match. + /// let span = unsafe { span.slice_unchecked(1..8) }; + /// + /// // matched the second ASCII character + /// assert_eq!(consumer.consume(span.clone()), Some(2)); + /// + /// // should match nothing now. + /// let span = unsafe { span.slice_unchecked(2..8) }; + /// assert_eq!(consumer.consume(span.clone()), None); + /// ``` + fn consume(&mut self, span: Span<&A>) -> Option<A::Index>; + + /// Repeatedly removes prefixes of the hay which matches the needle. + /// + /// This method is used to implement the standard algorithm + /// [`trim_start()`](ext/fn.trim_start.html). + /// + /// Returns the start index of the slice after all prefixes are removed. + /// + /// A fast generic implementation in terms of + /// [`.consume()`](#method.consume) is provided by default. Nevertheless, + /// many needles allow a higher-performance specialization. 
+ /// + /// # Examples + /// + /// ```rust + /// #![feature(needle)] + /// use std::needle::{Consumer, Needle}; + /// + /// let mut consumer = Needle::<&str>::into_consumer('x'); + /// assert_eq!(consumer.trim_start("xxxyy"), 3); + /// + /// let mut consumer = Needle::<&str>::into_consumer('x'); + /// assert_eq!(consumer.trim_start("yyxxx"), 0); + /// ``` + #[inline] + fn trim_start(&mut self, hay: &A) -> A::Index { + let mut offset = hay.start_index(); + let mut span = Span::from(hay); + while let Some(pos) = self.consume(span.clone()) { + offset = pos; + let (hay, range) = span.into_parts(); + if pos == range.start { + break; + } + span = unsafe { Span::from_parts(hay, pos..range.end) }; + } + offset + } +} + +/// A searcher which can be searched from the end. +/// +/// This trait provides methods for searching for non-overlapping matches of a +/// needle starting from the back (right) of a hay. +/// +/// # Safety +/// +/// This trait is marked unsafe because the range returned by its methods are +/// required to lie on valid codeword boundaries in the haystack. This enables +/// users of this trait to slice the haystack without additional runtime checks. +pub unsafe trait ReverseSearcher<A: Hay + ?Sized>: Searcher<A> { + /// Searches for the last range which the needle can be found in the span. + /// + /// This method is used to support the following standard algorithms: + /// + /// * [`rmatches`](ext/fn.rmatches.html) + /// * [`rmatch_indices`](ext/fn.rmatch_indices.html) + /// * [`rfind`](ext/fn.rfind.html) + /// * [`rmatch_ranges`](ext/fn.rmatch_ranges.html) + /// * [`rfind_range`](ext/fn.rfind_range.html) + /// * [`rsplit`](ext/fn.rsplit.html) + /// * [`rsplit_terminator`](ext/fn.rsplit_terminator.html) + /// * [`rsplitn`](ext/fn.rsplitn.html) + /// + /// The hay and the restricted range for searching can be recovered by + /// calling `span`[`.into_parts()`](struct.Span.html#method.into_parts).
+ /// The returned range should be relative to the hay and must be contained + /// within the restricted range from the span. + /// + /// If the needle is not found, this method should return `None`. + /// + /// # Examples + /// + /// Search for the locations of a substring inside a string, using the + /// searcher primitive. + /// + /// ``` + /// #![feature(needle)] + /// use std::needle::{ReverseSearcher, Needle, Span}; + /// + /// let mut searcher = Needle::<&str>::into_searcher("::"); + /// let span = Span::from("lion::tiger::leopard"); + /// // ^ ^ ^ + /// // string indices: 0 4 11 + /// + /// // found the last "::". + /// assert_eq!(searcher.rsearch(span.clone()), Some(11..13)); + /// + /// // slice the span to skip the last match. + /// let span = unsafe { span.slice_unchecked(0..11) }; + /// + /// // found the second to last "::". + /// assert_eq!(searcher.rsearch(span.clone()), Some(4..6)); + /// + /// // should find nothing now. + /// let span = unsafe { span.slice_unchecked(0..4) }; + /// assert_eq!(searcher.rsearch(span.clone()), None); + /// ``` + fn rsearch(&mut self, span: Span<&A>) -> Option<Range<A::Index>>; +} + +/// A consumer which can be searched from the end. +/// +/// This trait provides methods for matching a needle anchored at the end of a +/// hay. +/// +/// # Safety +/// +/// This trait is marked unsafe because the range returned by its methods are +/// required to lie on valid codeword boundaries in the haystack. This enables +/// users of this trait to slice the haystack without additional runtime checks. +pub unsafe trait ReverseConsumer<A: Hay + ?Sized>: Consumer<A> { + /// Checks if the needle can be found at the end of the span. + /// + /// This method is used to implement the standard algorithm + /// [`ends_with()`](ext/fn.ends_with.html) as well as providing the default + /// implementation for [`.trim_end()`](#method.trim_end).
+ /// + /// The hay and the restricted range for searching can be recovered by + /// calling `span`[`.into_parts()`](struct.Span.html#method.into_parts). + /// If a needle can be found ending at `range.end`, this method should + /// return the start index of the needle relative to the hay. + /// + /// If the needle cannot be found at the end of the span, this method + /// should return `None`. + /// + /// # Examples + /// + /// Consumes ASCII characters from the end. + /// + /// ``` + /// #![feature(needle)] + /// use std::needle::{ReverseConsumer, Needle, Span}; + /// + /// let mut consumer = Needle::<&str>::into_consumer(|c: char| c.is_ascii()); + /// let span = Span::from("Hi😋!!"); + /// + /// // consumes the last ASCII character + /// assert_eq!(consumer.rconsume(span.clone()), Some(7)); + /// + /// // slice the span to skip the last match. + /// let span = unsafe { span.slice_unchecked(0..7) }; + /// + /// // matched the second to last ASCII character + /// assert_eq!(consumer.rconsume(span.clone()), Some(6)); + /// + /// // should match nothing now. + /// let span = unsafe { span.slice_unchecked(0..6) }; + /// assert_eq!(consumer.rconsume(span.clone()), None); + /// ``` + fn rconsume(&mut self, hay: Span<&A>) -> Option<A::Index>; + + /// Repeatedly removes suffixes of the hay which matches the needle. + /// + /// This method is used to implement the standard algorithm + /// [`trim_end()`](ext/fn.trim_end.html). + /// + /// A fast generic implementation in terms of + /// [`.rconsume()`](#method.rconsume) is provided by default. + /// Nevertheless, many needles allow a higher-performance specialization.
+ /// + /// # Examples + /// + /// ```rust + /// #![feature(needle)] + /// use std::needle::{ReverseConsumer, Needle}; + /// + /// let mut consumer = Needle::<&str>::into_consumer('x'); + /// assert_eq!(consumer.trim_end("yyxxx"), 2); + /// + /// let mut consumer = Needle::<&str>::into_consumer('x'); + /// assert_eq!(consumer.trim_end("xxxyy"), 5); + /// ``` + #[inline] + fn trim_end(&mut self, hay: &A) -> A::Index { + let mut offset = hay.end_index(); + let mut span = Span::from(hay); + while let Some(pos) = self.rconsume(span.clone()) { + offset = pos; + let (hay, range) = span.into_parts(); + if pos == range.end { + break; + } + span = unsafe { Span::from_parts(hay, range.start..pos) }; + } + offset + } +} + +/// A searcher which can be searched from both ends with consistent results. +/// +/// Implementing this marker trait enables the following standard algorithms to +/// return [`DoubleEndedIterator`](../iter/trait.DoubleEndedIterator.html)s: +/// +/// * [`matches`](ext/fn.matches.html) / +/// [`rmatches`](ext/fn.rmatches.html) +/// * [`match_indices`](ext/fn.match_indices.html) / +/// [`rmatch_indices`](ext/fn.rmatch_indices.html) +/// * [`match_ranges`](ext/fn.match_ranges.html) / +/// [`rmatch_ranges`](ext/fn.rmatch_ranges.html) +/// * [`split`](ext/fn.split.html) / +/// [`rsplit`](ext/fn.rsplit.html) +/// * [`split_terminator`](ext/fn.split_terminator.html) / +/// [`rsplit_terminator`](ext/fn.rsplit_terminator.html) +/// * [`splitn`](ext/fn.splitn.html) / +/// [`rsplitn`](ext/fn.rsplitn.html) +/// +/// # Examples +/// +/// The searcher of a character implements `DoubleEndedSearcher`, while that of +/// a string does not. +/// +/// `match_indices` and `rmatch_indices` are reverse of each other only for a +/// `DoubleEndedSearcher`. +/// +/// ```rust +/// #![feature(needle)] +/// use std::needle::ext::{match_indices, rmatch_indices}; +/// +/// // `match_indices` and `rmatch_indices` are exact reverse of each other for a `char` needle.
+/// let forward = match_indices("xxxxx", 'x').collect::<Vec<_>>(); +/// let mut rev_backward = rmatch_indices("xxxxx", 'x').collect::<Vec<_>>(); +/// rev_backward.reverse(); +/// +/// assert_eq!(forward, vec![(0, "x"), (1, "x"), (2, "x"), (3, "x"), (4, "x")]); +/// assert_eq!(rev_backward, vec![(0, "x"), (1, "x"), (2, "x"), (3, "x"), (4, "x")]); +/// assert_eq!(forward, rev_backward); +/// +/// // this property does not exist on a `&str` needle in general. +/// let forward = match_indices("xxxxx", "xx").collect::<Vec<_>>(); +/// let mut rev_backward = rmatch_indices("xxxxx", "xx").collect::<Vec<_>>(); +/// rev_backward.reverse(); +/// +/// assert_eq!(forward, vec![(0, "xx"), (2, "xx")]); +/// assert_eq!(rev_backward, vec![(1, "xx"), (3, "xx")]); +/// assert_ne!(forward, rev_backward); +/// ``` +pub unsafe trait DoubleEndedSearcher<A: Hay + ?Sized>: ReverseSearcher<A> {} + +/// A consumer which can be searched from both ends with consistent results. +/// +/// It is used to support the following standard algorithm: +/// +/// * [`trim`](ext/fn.trim.html) +/// +/// The `trim` function is implemented by calling +/// [`trim_start`](ext/fn.trim_start.html) and [`trim_end`](ext/fn.trim_end.html) +/// together. This trait encodes the fact that we can call these two functions in any order. +/// +/// # Examples +/// +/// The consumer of a character implements `DoubleEndedConsumer`, while that of +/// a string does not. `trim` is implemented only for a `DoubleEndedConsumer`. +/// +/// ```rust +/// #![feature(needle)] +/// use std::needle::ext::{trim_start, trim_end, trim}; +/// +/// // for a `char`, we get the same trim result no matter which function is called first.
+/// let trim_start_first = trim_end(trim_start("xyxyx", 'x'), 'x'); +/// let trim_end_first = trim_start(trim_end("xyxyx", 'x'), 'x'); +/// let trim_together = trim("xyxyx", 'x'); +/// assert_eq!(trim_start_first, "yxy"); +/// assert_eq!(trim_end_first, "yxy"); +/// assert_eq!(trim_together, "yxy"); +/// +/// // this property does not exist for a `&str` in general. +/// let trim_start_first = trim_end(trim_start("xyxyx", "xyx"), "xyx"); +/// let trim_end_first = trim_start(trim_end("xyxyx", "xyx"), "xyx"); +/// // let trim_together = trim("xyxyx", "xyx"); // cannot be defined +/// assert_eq!(trim_start_first, "yx"); +/// assert_eq!(trim_end_first, "xy"); +/// // assert_eq!(trim_together, /*????*/); // cannot be defined +/// ``` +pub unsafe trait DoubleEndedConsumer<A: Hay + ?Sized>: ReverseConsumer<A> {} + +/// A needle, a type which can be converted into a searcher. +/// +/// When using search algorithms like [`split()`](ext/fn.split.html), users will +/// search with a `Needle` e.g. a `&str`. A needle is usually stateless, +/// however for efficient searching, we often need some preprocessing and +/// maintain a mutable state. The preprocessed structure is called the +/// [`Searcher`](trait.Searcher.html) of this needle. +/// +/// The relationship between `Searcher` and `Needle` is similar to `Iterator` +/// and `IntoIterator`. +pub trait Needle<H: Haystack>: Sized +where H::Target: Hay // FIXME: RFC 2089 or 2289 +{ + /// The searcher associated with this needle. + type Searcher: Searcher<H::Target>; + + /// The consumer associated with this needle. + type Consumer: Consumer<H::Target>; + + /// Produces a searcher for this needle. + fn into_searcher(self) -> Self::Searcher; + + /// Produces a consumer for this needle. + /// + /// Usually a consumer and a searcher can be the same type. + /// Some needles may require different types + /// when the two need different optimization strategies.
String searching + /// is an example of this: we use the Two-Way Algorithm when searching for + /// substrings, which needs to preprocess the needle. However this is + /// irrelevant for consuming, which only needs to check for string equality + /// once. Therefore the Consumer for a string would be a distinct type + /// using naive search. + fn into_consumer(self) -> Self::Consumer; +} + +/// Searcher of an empty needle. +/// +/// This searcher will find all empty subslices between any codewords in a +/// haystack. +#[derive(Clone, Debug, Default)] +pub struct EmptySearcher { + consumed_start: bool, + consumed_end: bool, +} + +unsafe impl<A: Hay + ?Sized> Searcher<A> for EmptySearcher { + #[inline] + fn search(&mut self, span: Span<&A>) -> Option<Range<A::Index>> { + let (hay, range) = span.into_parts(); + let start = if !self.consumed_start { + self.consumed_start = true; + range.start + } else if range.start == range.end { + return None; + } else { + unsafe { hay.next_index(range.start) } + }; + Some(start..start) + } +} + +unsafe impl<A: Hay + ?Sized> Consumer<A> for EmptySearcher { + #[inline] + fn consume(&mut self, span: Span<&A>) -> Option<A::Index> { + let (_, range) = span.into_parts(); + Some(range.start) + } + + #[inline] + fn trim_start(&mut self, hay: &A) -> A::Index { + hay.start_index() + } +} + +unsafe impl<A: Hay + ?Sized> ReverseSearcher<A> for EmptySearcher { + #[inline] + fn rsearch(&mut self, span: Span<&A>) -> Option<Range<A::Index>> { + let (hay, range) = span.into_parts(); + let end = if !self.consumed_end { + self.consumed_end = true; + range.end + } else if range.start == range.end { + return None; + } else { + unsafe { hay.prev_index(range.end) } + }; + Some(end..end) + } +} + +unsafe impl<A: Hay + ?Sized> ReverseConsumer<A> for EmptySearcher { + #[inline] + fn rconsume(&mut self, span: Span<&A>) -> Option<A::Index> { + let (_, range) = span.into_parts(); + Some(range.end) + } + + #[inline] + fn trim_end(&mut self, hay: &A) -> A::Index { 
+ hay.end_index() + } +} + +unsafe impl<A: Hay + ?Sized> DoubleEndedSearcher<A> for EmptySearcher {} +unsafe impl<A: Hay + ?Sized> DoubleEndedConsumer<A> for EmptySearcher {} diff --git a/src/libcore/slice/mod.rs b/src/libcore/slice/mod.rs index bf3dda48dc797..d3806e3171293 100644 --- a/src/libcore/slice/mod.rs +++ b/src/libcore/slice/mod.rs @@ -34,6 +34,9 @@ use crate::result::Result::{Ok, Err}; use crate::ptr; use crate::mem; use crate::marker::{Copy, Send, Sync, Sized, self}; +use crate::needle::{ + ext, Needle, Searcher, ReverseSearcher, Consumer, ReverseConsumer, DoubleEndedConsumer, +}; #[unstable(feature = "slice_internals", issue = "0", reason = "exposed from core to be reused in std; use the memchr crate")] @@ -43,6 +46,12 @@ pub mod memchr; mod rotate; mod sort; +/// Needle implementations for slices +#[unstable(feature = "slice_internals", issue = "0", + reason = "exposed from core to be reused in std")] +#[doc(hidden)] +pub mod needles; + #[repr(C)] union Repr<'a, T: 'a> { rust: &'a [T], @@ -1006,8 +1015,10 @@ impl<T> [T] { /// # Examples /// /// ``` + /// #![feature(slice_needle_methods)] + /// /// let slice = [10, 40, 33, 20]; - /// let mut iter = slice.split(|num| num % 3 == 0); + /// let mut iter = slice.split_match(|num: &i32| num % 3 == 0); /// /// assert_eq!(iter.next().unwrap(), &[10, 40]); /// assert_eq!(iter.next().unwrap(), &[20]); @@ -1020,8 +1031,10 @@ impl<T> [T] { /// iterator: /// /// ``` + /// #![feature(slice_needle_methods)] + /// /// let slice = [10, 40, 33]; - /// let mut iter = slice.split(|num| num % 3 == 0); + /// let mut iter = slice.split_match(|num: &i32| num % 3 == 0); /// /// assert_eq!(iter.next().unwrap(), &[10, 40]); /// assert_eq!(iter.next().unwrap(), &[]); @@ -1032,24 +1045,41 @@ impl<T> [T] { /// present between them: /// /// ``` + /// #![feature(slice_needle_methods)] + /// /// let slice = [10, 6, 33, 20]; - /// let mut iter = slice.split(|num| num % 3 == 0); + /// let mut iter = slice.split_match(|num: &i32| num % 3 
== 0); /// /// assert_eq!(iter.next().unwrap(), &[10]); /// assert_eq!(iter.next().unwrap(), &[]); /// assert_eq!(iter.next().unwrap(), &[20]); /// assert!(iter.next().is_none()); /// ``` + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn split_match<'a, F>(&'a self, pred: F) -> ext::Split<&'a [T], F::Searcher> + where + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::split(self, pred) + } + + /// Returns an iterator over subslices separated by elements that match + /// `pred`. The matched element is not contained in the subslices. + /// + /// This method is the stable equivalent of + /// [`split_match`](#method.split_match), except it only accepts closures + /// instead of all kinds of needles. See documentations of `split_match` for + /// usage examples. #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn split<F>(&self, pred: F) -> Split<'_, T, F> - where F: FnMut(&T) -> bool + where + F: FnMut(&T) -> bool, { - Split { - v: self, - pred, - finished: false - } + ext::split(self, pred) } /// Returns an iterator over mutable subslices separated by elements that @@ -1058,19 +1088,40 @@ impl<T> [T] { /// # Examples /// /// ``` + /// #![feature(slice_needle_methods)] + /// /// let mut v = [10, 40, 30, 20, 60, 50]; /// - /// for group in v.split_mut(|num| *num % 3 == 0) { + /// for group in v.split_match_mut(|num: &i32| *num % 3 == 0) { /// group[0] = 1; /// } /// assert_eq!(v, [1, 40, 30, 1, 60, 1]); /// ``` + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn split_match_mut<'a, F>(&'a mut self, pred: F) -> ext::Split<&'a mut [T], F::Searcher> + where + F: Needle<&'a mut [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::split(self, pred) + } + + /// Returns an iterator over mutable subslices separated by elements that + /// match `pred`. 
The matched element is not contained in the subslices. + /// + /// This method is the stable equivalent of + /// [`split_match_mut`](#method.split_match_mut), except it only accepts + /// closures instead of all kinds of needles. See documentations of + /// `split_match_mut` for usage examples. #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn split_mut<F>(&mut self, pred: F) -> SplitMut<'_, T, F> - where F: FnMut(&T) -> bool + where + F: FnMut(&T) -> bool, { - SplitMut { v: self, pred, finished: false } + ext::split(self, pred) } /// Returns an iterator over subslices separated by elements that match @@ -1080,32 +1131,56 @@ impl<T> [T] { /// # Examples /// /// ``` + /// #![feature(slice_needle_methods)] + /// /// let slice = [11, 22, 33, 0, 44, 55]; - /// let mut iter = slice.rsplit(|num| *num == 0); + /// let mut iter = slice.rsplit_match(&[0]); /// /// assert_eq!(iter.next().unwrap(), &[44, 55]); /// assert_eq!(iter.next().unwrap(), &[11, 22, 33]); /// assert_eq!(iter.next(), None); /// ``` /// - /// As with `split()`, if the first or last element is matched, an empty + /// As with `split_match()`, if the first or last element is matched, an empty /// slice will be the first (or last) item returned by the iterator. 
/// /// ``` + /// #![feature(slice_needle_methods)] + /// /// let v = &[0, 1, 1, 2, 3, 5, 8]; - /// let mut it = v.rsplit(|n| *n % 2 == 0); + /// let mut it = v.rsplit_match(|n: &i32| *n % 2 == 0); /// assert_eq!(it.next().unwrap(), &[]); /// assert_eq!(it.next().unwrap(), &[3, 5]); /// assert_eq!(it.next().unwrap(), &[1, 1]); /// assert_eq!(it.next().unwrap(), &[]); /// assert_eq!(it.next(), None); /// ``` + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn rsplit_match<'a, F>(&'a self, pred: F) -> ext::RSplit<&'a [T], F::Searcher> + where + F: Needle<&'a [T]>, + F::Searcher: ReverseSearcher<[T]>, + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::rsplit(self, pred) + } + + /// Returns an iterator over subslices separated by elements that match + /// `pred`, starting at the end of the slice and working backwards. + /// The matched element is not contained in the subslices. + /// + /// This method is the stable equivalent of + /// [`rsplit_match`](#method.rsplit_match), except it only accepts closures + /// instead of all kinds of needles. See documentations of `rsplit_match` + /// for usage examples. 
#[stable(feature = "slice_rsplit", since = "1.27.0")] #[inline] pub fn rsplit<F>(&self, pred: F) -> RSplit<'_, T, F> - where F: FnMut(&T) -> bool + where + F: FnMut(&T) -> bool, { - RSplit { inner: self.split(pred) } + ext::rsplit(self, pred) } /// Returns an iterator over mutable subslices separated by elements that @@ -1115,22 +1190,44 @@ impl<T> [T] { /// # Examples /// /// ``` + /// #![feature(slice_needle_methods)] + /// /// let mut v = [100, 400, 300, 200, 600, 500]; /// /// let mut count = 0; - /// for group in v.rsplit_mut(|num| *num % 3 == 0) { + /// for group in v.rsplit_match_mut(|num: &i32| *num % 3 == 0) { /// count += 1; /// group[0] = count; /// } /// assert_eq!(v, [3, 400, 300, 2, 600, 1]); /// ``` /// + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn rsplit_match_mut<'a, F>(&'a mut self, pred: F) -> ext::RSplit<&'a mut [T], F::Searcher> + where + F: Needle<&'a mut [T]>, + F::Searcher: ReverseSearcher<[T]>, + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::rsplit(self, pred) + } + + /// Returns an iterator over mutable subslices separated by elements that + /// match `pred`, starting at the end of the slice and working + /// backwards. The matched element is not contained in the subslices. + /// + /// This method is the stable equivalent of + /// [`rsplit_match_mut`](#method.rsplit_match_mut), except it only accepts + /// closures instead of all kinds of needles. See documentations of + /// `rsplit_match_mut` for usage examples. 
#[stable(feature = "slice_rsplit", since = "1.27.0")] #[inline] - pub fn rsplit_mut<F>(&mut self, pred: F) -> RSplitMut<'_, T, F> - where F: FnMut(&T) -> bool + pub fn rsplit_mut<F>(&mut self, pred: F) -> RSplitMut<'_, T, F> + where + F: FnMut(&T) -> bool, { - RSplitMut { inner: self.split_mut(pred) } + ext::rsplit(self, pred) } /// Returns an iterator over subslices separated by elements that match @@ -1146,23 +1243,40 @@ impl<T> [T] { /// `[20, 60, 50]`): /// /// ``` + /// #![feature(slice_needle_methods)] + /// /// let v = [10, 40, 30, 20, 60, 50]; /// - /// for group in v.splitn(2, |num| *num % 3 == 0) { + /// for group in v.splitn_match(2, |num: &i32| *num % 3 == 0) { /// println!("{:?}", group); /// } /// ``` + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn splitn_match<'a, F>(&'a self, n: usize, pred: F) -> ext::SplitN<&'a [T], F::Searcher> + where + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::splitn(self, n, pred) + } + + /// Returns an iterator over subslices separated by elements that match + /// `pred`, limited to returning at most `n` items. The matched element is + /// not contained in the subslices. + /// + /// This method is the stable equivalent of + /// [`splitn_match`](#method.splitn_match), except it only accepts closures + /// instead of all kinds of needles. See documentations of `splitn_match` + /// for usage examples.
#[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn splitn<F>(&self, n: usize, pred: F) -> SplitN<'_, T, F> - where F: FnMut(&T) -> bool + where + F: FnMut(&T) -> bool, { - SplitN { - inner: GenericSplitN { - iter: self.split(pred), - count: n - } - } + ext::splitn(self, n, pred) } /// Returns an iterator over subslices separated by elements that match @@ -1175,24 +1289,42 @@ impl<T> [T] { /// # Examples /// /// ``` + /// #![feature(slice_needle_methods)] + /// /// let mut v = [10, 40, 30, 20, 60, 50]; /// - /// for group in v.splitn_mut(2, |num| *num % 3 == 0) { + /// for group in v.splitn_match_mut(2, |num: &i32| *num % 3 == 0) { /// group[0] = 1; /// } /// assert_eq!(v, [1, 40, 30, 1, 60, 50]); /// ``` + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn splitn_match_mut<'a, F>(&'a mut self, n: usize, pred: F) + -> ext::SplitN<&'a mut [T], F::Searcher> + where + F: Needle<&'a mut [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::splitn(self, n, pred) + } + + /// Returns an iterator over subslices separated by elements that match + /// `pred`, limited to returning at most `n` items. The matched element is + /// not contained in the subslices. + /// + /// This method is the stable equivalent of + /// [`splitn_match_mut`](#method.splitn_match_mut), except it only accepts + /// closures instead of all kinds of needles. See documentations of + /// `splitn_match_mut` for usage examples. 
#[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn splitn_mut<F>(&mut self, n: usize, pred: F) -> SplitNMut<'_, T, F> - where F: FnMut(&T) -> bool + where + F: FnMut(&T) -> bool, { - SplitNMut { - inner: GenericSplitN { - iter: self.split_mut(pred), - count: n - } - } + ext::splitn(self, n, pred) } /// Returns an iterator over subslices separated by elements that match @@ -1209,23 +1341,41 @@ impl<T> [T] { /// by 3 (i.e., `[50]`, `[10, 40, 30, 20]`): /// /// ``` + /// #![feature(slice_needle_methods)] + /// /// let v = [10, 40, 30, 20, 60, 50]; /// - /// for group in v.rsplitn(2, |num| *num % 3 == 0) { + /// for group in v.rsplitn_match(2, |num: &i32| *num % 3 == 0) { /// println!("{:?}", group); /// } /// ``` + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn rsplitn_match<'a, F>(&'a self, n: usize, pred: F) -> ext::RSplitN<&'a [T], F::Searcher> + where + F: Needle<&'a [T]>, + F::Searcher: ReverseSearcher<[T]>, + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::rsplitn(self, n, pred) + } + + /// Returns an iterator over subslices separated by elements that match + /// `pred` limited to returning at most `n` items. This starts at the end of + /// the slice and works backwards. The matched element is not contained in + /// the subslices. + /// + /// This method is the stable equivalent of + /// [`rsplitn_match`](#method.rsplitn_match), except it only accepts + /// closures instead of all kinds of needles. See documentations of + /// `rsplitn_match` for usage examples. 
#[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn rsplitn<F>(&self, n: usize, pred: F) -> RSplitN<'_, T, F> - where F: FnMut(&T) -> bool + where + F: FnMut(&T) -> bool, { - RSplitN { - inner: GenericSplitN { - iter: self.rsplit(pred), - count: n - } - } + ext::rsplitn(self, n, pred) } /// Returns an iterator over subslices separated by elements that match @@ -1239,24 +1389,43 @@ impl<T> [T] { /// # Examples /// /// ``` + /// #![feature(slice_needle_methods)] + /// /// let mut s = [10, 40, 30, 20, 60, 50]; /// - /// for group in s.rsplitn_mut(2, |num| *num % 3 == 0) { + /// for group in s.rsplitn_match_mut(2, |num: &i32| *num % 3 == 0) { /// group[0] = 1; /// } /// assert_eq!(s, [1, 40, 30, 20, 60, 1]); /// ``` + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn rsplitn_match_mut<'a, F>(&'a mut self, n: usize, pred: F) + -> ext::RSplitN<&'a mut [T], F::Searcher> + where + F: Needle<&'a mut [T]>, + F::Searcher: ReverseSearcher<[T]>, + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::rsplitn(self, n, pred) + } + + /// Returns an iterator over subslices separated by elements that match + /// `pred` limited to returning at most `n` items. This starts at the end of + /// the slice and works backwards. The matched element is not contained in + /// the subslices. + /// + /// This method is the stable equivalent of + /// [`rsplitn_match_mut`](#method.rsplitn_match_mut), except it only accepts + /// closures instead of all kinds of needles. See documentations of + /// `rsplitn_match_mut` for usage examples. #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn rsplitn_mut<F>(&mut self, n: usize, pred: F) -> RSplitNMut<'_, T, F> - where F: FnMut(&T) -> bool + where + F: FnMut(&T) -> bool, { - RSplitNMut { - inner: GenericSplitN { - iter: self.rsplit_mut(pred), - count: n - } - } + ext::rsplitn(self, n, pred) } /// Returns `true` if the slice contains an element with the given value. 
@@ -1275,6 +1444,312 @@ impl<T> [T] { x.slice_contains(self) } + /// Returns `true` if the given predicate matches a sub-slice of this slice. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn contains_match<'a, F>(&'a self, pred: F) -> bool + where + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::contains(self, pred) + } + + /// Returns the index of the first sub-slice of this slice that matches the + /// predicate. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn find<'a, F>(&'a self, pred: F) -> Option<usize> + where + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::find(self, pred) + } + + /// Returns the index of the last sub-slice of this slice that matches the + /// predicate. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn rfind<'a, F>(&'a self, pred: F) -> Option<usize> + where + F: Needle<&'a [T]>, + F::Searcher: ReverseSearcher<[T]>, + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::rfind(self, pred) + } + + /// Returns the index range of the first sub-slice of this slice that + /// matches the predicate. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn find_range<'a, F>(&'a self, pred: F) -> Option<ops::Range<usize>> + where + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::find_range(self, pred) + } + + /// Returns the index range of the last sub-slice of this slice that matches + /// the predicate. 
+ #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn rfind_range<'a, F>(&'a self, pred: F) -> Option<ops::Range<usize>> + where + F: Needle<&'a [T]>, + F::Searcher: ReverseSearcher<[T]>, + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::rfind_range(self, pred) + } + + /// An iterator over the disjoint matches of a predicate within the given + /// slice. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn matches<'a, F>(&'a self, pred: F) -> ext::Matches<&'a [T], F::Searcher> + where + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::matches(self, pred) + } + + /// An iterator over the disjoint matches of a predicate within the given + /// mutable slice. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn matches_mut<'a, F>(&'a mut self, pred: F) -> ext::Matches<&'a mut [T], F::Searcher> + where + F: Needle<&'a mut [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::matches(self, pred) + } + + /// An iterator over the disjoint matches of a predicate within the given + /// slice, yielded in reverse order. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn rmatches<'a, F>(&'a self, pred: F) -> ext::RMatches<&'a [T], F::Searcher> + where + F: Needle<&'a [T]>, + F::Searcher: ReverseSearcher<[T]>, + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::rmatches(self, pred) + } + + /// An iterator over the disjoint matches of a predicate within the given + /// mutable slice, yielded in reverse order. 
+ #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn rmatches_mut<'a, F>(&'a mut self, pred: F) -> ext::RMatches<&'a mut [T], F::Searcher> + where + F: Needle<&'a mut [T]>, + F::Searcher: ReverseSearcher<[T]>, + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::rmatches(self, pred) + } + + /// An iterator over the disjoint matches of a predicate within the given + /// slice as well as the index that the match starts at. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn match_indices<'a, F>(&'a self, pred: F) + -> ext::MatchIndices<&'a [T], F::Searcher> + where + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::match_indices(self, pred) + } + + /// An iterator over the disjoint matches of a predicate within the given + /// mutable slice as well as the index that the match starts at. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn match_indices_mut<'a, F>(&'a mut self, pred: F) + -> ext::MatchIndices<&'a mut [T], F::Searcher> + where + F: Needle<&'a mut [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::match_indices(self, pred) + } + + /// An iterator over the disjoint matches of a predicate within the given + /// slice as well as the index that the match starts at, yielded in reverse order. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn rmatch_indices<'a, F>(&'a self, pred: F) + -> ext::RMatchIndices<&'a [T], F::Searcher> + where + F: Needle<&'a [T]>, + F::Searcher: ReverseSearcher<[T]>, + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::rmatch_indices(self, pred) + } + + /// An iterator over the disjoint matches of a predicate within the given + /// mutable slice as well as the index that the match starts at, yielded in reverse order.
+ #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn rmatch_indices_mut<'a, F>(&'a mut self, pred: F) + -> ext::RMatchIndices<&'a mut [T], F::Searcher> + where + F: Needle<&'a mut [T]>, + F::Searcher: ReverseSearcher<[T]>, + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::rmatch_indices(self, pred) + } + + /// An iterator over the disjoint matches of a predicate within the given + /// slice as well as the range that the match covers. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn match_ranges<'a, F>(&'a self, pred: F) + -> ext::MatchRanges<&'a [T], F::Searcher> + where + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::match_ranges(self, pred) + } + + /// An iterator over the disjoint matches of a predicate within the given + /// mutable slice as well as the range that the match covers. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn match_ranges_mut<'a, F>(&'a mut self, pred: F) + -> ext::MatchRanges<&'a mut [T], F::Searcher> + where + F: Needle<&'a mut [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::match_ranges(self, pred) + } + + /// An iterator over the disjoint matches of a predicate within the given + /// slice as well as the range that the match covers, yielded in reverse order. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn rmatch_ranges<'a, F>(&'a self, pred: F) + -> ext::RMatchRanges<&'a [T], F::Searcher> + where + F: Needle<&'a [T]>, + F::Searcher: ReverseSearcher<[T]>, + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::rmatch_ranges(self, pred) + } + + /// An iterator over the disjoint matches of a predicate within the given + /// mutable slice as well as the range that the match covers, yielded in reverse order.
+ #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn rmatch_ranges_mut<'a, F>(&'a mut self, pred: F) + -> ext::RMatchRanges<&'a mut [T], F::Searcher> + where + F: Needle<&'a mut [T]>, + F::Searcher: ReverseSearcher<[T]>, + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::rmatch_ranges(self, pred) + } + + /// Returns a slice with all prefixes and suffixes that match a predicate + /// repeatedly removed. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn trim_matches<'a, F>(&'a self, pred: F) -> &'a [T] + where + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: DoubleEndedConsumer<[T]>, + { + ext::trim(self, pred) + } + + /// Returns a mutable slice with all prefixes and suffixes that match a + /// predicate repeatedly removed. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn trim_matches_mut<'a, F>(&'a mut self, pred: F) -> &'a mut [T] + where + F: Needle<&'a mut [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: DoubleEndedConsumer<[T]>, + { + ext::trim(self, pred) + } + + /// Returns a slice with all prefixes that match a predicate repeatedly + /// removed. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn trim_start_matches<'a, F>(&'a self, pred: F) -> &'a [T] + where + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::trim_start(self, pred) + } + + /// Returns a mutable slice with all prefixes that match a predicate + /// repeatedly removed. 
+ #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn trim_start_matches_mut<'a, F>(&'a mut self, pred: F) -> &'a mut [T] + where + F: Needle<&'a mut [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 + { + ext::trim_start(self, pred) + } + + /// Returns a slice with all suffixes that match a predicate repeatedly + /// removed. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn trim_end_matches<'a, F>(&'a self, pred: F) -> &'a [T] + where + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: ReverseConsumer<[T]>, + { + ext::trim_end(self, pred) + } + + /// Returns a mutable slice with all suffixes that match a predicate + /// repeatedly removed. + #[unstable(feature = "slice_needle_methods", issue = "56345")] + #[inline] + pub fn trim_end_matches_mut<'a, F>(&'a mut self, pred: F) -> &'a mut [T] + where + F: Needle<&'a mut [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: ReverseConsumer<[T]>, + { + ext::trim_end(self, pred) + } + /// Returns `true` if `needle` is a prefix of the slice. /// /// # Examples @@ -1296,11 +1771,13 @@ impl<T> [T] { /// assert!(v.starts_with(&[])); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - pub fn starts_with(&self, needle: &[T]) -> bool - where T: PartialEq + pub fn starts_with<'a, F>(&'a self, needle: F) -> bool + where + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: Consumer<[T]>, // FIXME: RFC 2089 { - let n = needle.len(); - self.len() >= n && needle == &self[..n] + ext::starts_with(self, needle) } /// Returns `true` if `needle` is a suffix of the slice. 
@@ -1324,11 +1801,13 @@ impl<T> [T] { /// assert!(v.ends_with(&[])); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - pub fn ends_with(&self, needle: &[T]) -> bool - where T: PartialEq + pub fn ends_with<'a, F>(&'a self, needle: F) -> bool + where + F: Needle<&'a [T]>, + F::Searcher: Searcher<[T]>, // FIXME: RFC 2089 + F::Consumer: ReverseConsumer<[T]>, { - let (m, n) = (self.len(), needle.len()); - m >= n && needle == &self[m-n..] + ext::ends_with(self, needle) } /// Binary searches this sorted slice for a given element. @@ -3468,463 +3947,101 @@ impl<'a, T> IterMut<'a, T> { iterator!{struct IterMut -> *mut T, &'a mut T, mut, {mut}, {}} -/// An internal abstraction over the splitting iterators, so that -/// splitn, splitn_mut etc can be implemented once. -#[doc(hidden)] -trait SplitIter: DoubleEndedIterator { - /// Marks the underlying iterator as complete, extracting the remaining - /// portion of the slice. - fn finish(&mut self) -> Option<Self::Item>; -} - -/// An iterator over subslices separated by elements that match a predicate -/// function. -/// -/// This struct is created by the [`split`] method on [slices].
-/// -/// [`split`]: ../../std/primitive.slice.html#method.split -/// [slices]: ../../std/primitive.slice.html -#[stable(feature = "rust1", since = "1.0.0")] -pub struct Split<'a, T:'a, P> where P: FnMut(&T) -> bool { - v: &'a [T], - pred: P, - finished: bool -} - -#[stable(feature = "core_impl_debug", since = "1.9.0")] -impl<T: fmt::Debug, P> fmt::Debug for Split<'_, T, P> where P: FnMut(&T) -> bool { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("Split") - .field("v", &self.v) - .field("finished", &self.finished) - .finish() - } -} - -// FIXME(#26925) Remove in favor of `#[derive(Clone)]` -#[stable(feature = "rust1", since = "1.0.0")] -impl<T, P> Clone for Split<'_, T, P> where P: Clone + FnMut(&T) -> bool { - fn clone(&self) -> Self { - Split { - v: self.v, - pred: self.pred.clone(), - finished: self.finished, - } - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a, T, P> Iterator for Split<'a, T, P> where P: FnMut(&T) -> bool { - type Item = &'a [T]; - - #[inline] - fn next(&mut self) -> Option<&'a [T]> { - if self.finished { return None; } - - match self.v.iter().position(|x| (self.pred)(x)) { - None => self.finish(), - Some(idx) => { - let ret = Some(&self.v[..idx]); - self.v = &self.v[idx + 1..]; - ret - } - } - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - if self.finished { - (0, Some(0)) - } else { - (1, Some(self.v.len() + 1)) - } - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a, T, P> DoubleEndedIterator for Split<'a, T, P> where P: FnMut(&T) -> bool { - #[inline] - fn next_back(&mut self) -> Option<&'a [T]> { - if self.finished { return None; } - - match self.v.iter().rposition(|x| (self.pred)(x)) { - None => self.finish(), - Some(idx) => { - let ret = Some(&self.v[idx + 1..]); - self.v = &self.v[..idx]; - ret - } - } - } -} - -impl<'a, T, P> SplitIter for Split<'a, T, P> where P: FnMut(&T) -> bool { - #[inline] - fn finish(&mut self) -> Option<&'a [T]> { - if 
self.finished { None } else { self.finished = true; Some(self.v) } - } -} - -#[stable(feature = "fused", since = "1.26.0")] -impl<T, P> FusedIterator for Split<'_, T, P> where P: FnMut(&T) -> bool {} - -/// An iterator over the subslices of the vector which are separated -/// by elements that match `pred`. -/// -/// This struct is created by the [`split_mut`] method on [slices]. -/// -/// [`split_mut`]: ../../std/primitive.slice.html#method.split_mut -/// [slices]: ../../std/primitive.slice.html -#[stable(feature = "rust1", since = "1.0.0")] -pub struct SplitMut<'a, T:'a, P> where P: FnMut(&T) -> bool { - v: &'a mut [T], - pred: P, - finished: bool -} - -#[stable(feature = "core_impl_debug", since = "1.9.0")] -impl<T: fmt::Debug, P> fmt::Debug for SplitMut<'_, T, P> where P: FnMut(&T) -> bool { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SplitMut") - .field("v", &self.v) - .field("finished", &self.finished) - .finish() - } -} - -impl<'a, T, P> SplitIter for SplitMut<'a, T, P> where P: FnMut(&T) -> bool { - #[inline] - fn finish(&mut self) -> Option<&'a mut [T]> { - if self.finished { - None - } else { - self.finished = true; - Some(mem::replace(&mut self.v, &mut [])) - } - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a, T, P> Iterator for SplitMut<'a, T, P> where P: FnMut(&T) -> bool { - type Item = &'a mut [T]; - - #[inline] - fn next(&mut self) -> Option<&'a mut [T]> { - if self.finished { return None; } - - let idx_opt = { // work around borrowck limitations - let pred = &mut self.pred; - self.v.iter().position(|x| (*pred)(x)) - }; - match idx_opt { - None => self.finish(), - Some(idx) => { - let tmp = mem::replace(&mut self.v, &mut []); - let (head, tail) = tmp.split_at_mut(idx); - self.v = &mut tail[1..]; - Some(head) - } - } - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - if self.finished { - (0, Some(0)) - } else { - // if the predicate doesn't match anything, we yield one slice - // if 
it matches every element, we yield len+1 empty slices. - (1, Some(self.v.len() + 1)) - } - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a, T, P> DoubleEndedIterator for SplitMut<'a, T, P> where - P: FnMut(&T) -> bool, -{ - #[inline] - fn next_back(&mut self) -> Option<&'a mut [T]> { - if self.finished { return None; } - - let idx_opt = { // work around borrowck limitations - let pred = &mut self.pred; - self.v.iter().rposition(|x| (*pred)(x)) - }; - match idx_opt { - None => self.finish(), - Some(idx) => { - let tmp = mem::replace(&mut self.v, &mut []); - let (head, tail) = tmp.split_at_mut(idx); - self.v = head; - Some(&mut tail[1..]) - } - } - } -} - -#[stable(feature = "fused", since = "1.26.0")] -impl<T, P> FusedIterator for SplitMut<'_, T, P> where P: FnMut(&T) -> bool {} - -/// An iterator over subslices separated by elements that match a predicate -/// function, starting from the end of the slice. -/// -/// This struct is created by the [`rsplit`] method on [slices]. -/// -/// [`rsplit`]: ../../std/primitive.slice.html#method.rsplit -/// [slices]: ../../std/primitive.slice.html -#[stable(feature = "slice_rsplit", since = "1.27.0")] -#[derive(Clone)] // Is this correct, or does it incorrectly require `T: Clone`? 
-pub struct RSplit<'a, T:'a, P> where P: FnMut(&T) -> bool { - inner: Split<'a, T, P> -} - -#[stable(feature = "slice_rsplit", since = "1.27.0")] -impl<T: fmt::Debug, P> fmt::Debug for RSplit<'_, T, P> where P: FnMut(&T) -> bool { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("RSplit") - .field("v", &self.inner.v) - .field("finished", &self.inner.finished) - .finish() - } -} - -#[stable(feature = "slice_rsplit", since = "1.27.0")] -impl<'a, T, P> Iterator for RSplit<'a, T, P> where P: FnMut(&T) -> bool { - type Item = &'a [T]; - - #[inline] - fn next(&mut self) -> Option<&'a [T]> { - self.inner.next_back() - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - self.inner.size_hint() - } -} - -#[stable(feature = "slice_rsplit", since = "1.27.0")] -impl<'a, T, P> DoubleEndedIterator for RSplit<'a, T, P> where P: FnMut(&T) -> bool { - #[inline] - fn next_back(&mut self) -> Option<&'a [T]> { - self.inner.next() - } -} - -#[stable(feature = "slice_rsplit", since = "1.27.0")] -impl<'a, T, P> SplitIter for RSplit<'a, T, P> where P: FnMut(&T) -> bool { - #[inline] - fn finish(&mut self) -> Option<&'a [T]> { - self.inner.finish() - } -} - -#[stable(feature = "slice_rsplit", since = "1.27.0")] -impl<T, P> FusedIterator for RSplit<'_, T, P> where P: FnMut(&T) -> bool {} - -/// An iterator over the subslices of the vector which are separated -/// by elements that match `pred`, starting from the end of the slice. -/// -/// This struct is created by the [`rsplit_mut`] method on [slices]. 
-/// -/// [`rsplit_mut`]: ../../std/primitive.slice.html#method.rsplit_mut -/// [slices]: ../../std/primitive.slice.html -#[stable(feature = "slice_rsplit", since = "1.27.0")] -pub struct RSplitMut<'a, T:'a, P> where P: FnMut(&T) -> bool { - inner: SplitMut<'a, T, P> -} - -#[stable(feature = "slice_rsplit", since = "1.27.0")] -impl<T: fmt::Debug, P> fmt::Debug for RSplitMut<'_, T, P> where P: FnMut(&T) -> bool { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("RSplitMut") - .field("v", &self.inner.v) - .field("finished", &self.inner.finished) - .finish() - } -} - -#[stable(feature = "slice_rsplit", since = "1.27.0")] -impl<'a, T, P> SplitIter for RSplitMut<'a, T, P> where P: FnMut(&T) -> bool { - #[inline] - fn finish(&mut self) -> Option<&'a mut [T]> { - self.inner.finish() - } -} - -#[stable(feature = "slice_rsplit", since = "1.27.0")] -impl<'a, T, P> Iterator for RSplitMut<'a, T, P> where P: FnMut(&T) -> bool { - type Item = &'a mut [T]; - - #[inline] - fn next(&mut self) -> Option<&'a mut [T]> { - self.inner.next_back() - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - self.inner.size_hint() - } -} - -#[stable(feature = "slice_rsplit", since = "1.27.0")] -impl<'a, T, P> DoubleEndedIterator for RSplitMut<'a, T, P> where - P: FnMut(&T) -> bool, -{ - #[inline] - fn next_back(&mut self) -> Option<&'a mut [T]> { - self.inner.next() - } -} - -#[stable(feature = "slice_rsplit", since = "1.27.0")] -impl<T, P> FusedIterator for RSplitMut<'_, T, P> where P: FnMut(&T) -> bool {} - -/// An private iterator over subslices separated by elements that -/// match a predicate function, splitting at most a fixed number of -/// times. 
-#[derive(Debug)] -struct GenericSplitN<I> { - iter: I, - count: usize, -} - -impl<T, I: SplitIter<Item=T>> Iterator for GenericSplitN<I> { - type Item = T; - - #[inline] - fn next(&mut self) -> Option<T> { - match self.count { - 0 => None, - 1 => { self.count -= 1; self.iter.finish() } - _ => { self.count -= 1; self.iter.next() } - } - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - let (lower, upper_opt) = self.iter.size_hint(); - (lower, upper_opt.map(|upper| cmp::min(self.count, upper))) - } -} - -/// An iterator over subslices separated by elements that match a predicate -/// function, limited to a given number of splits. -/// -/// This struct is created by the [`splitn`] method on [slices]. -/// -/// [`splitn`]: ../../std/primitive.slice.html#method.splitn -/// [slices]: ../../std/primitive.slice.html -#[stable(feature = "rust1", since = "1.0.0")] -pub struct SplitN<'a, T: 'a, P> where P: FnMut(&T) -> bool { - inner: GenericSplitN<Split<'a, T, P>> -} - -#[stable(feature = "core_impl_debug", since = "1.9.0")] -impl<T: fmt::Debug, P> fmt::Debug for SplitN<'_, T, P> where P: FnMut(&T) -> bool { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SplitN") - .field("inner", &self.inner) - .finish() - } -} - -/// An iterator over subslices separated by elements that match a -/// predicate function, limited to a given number of splits, starting -/// from the end of the slice. -/// -/// This struct is created by the [`rsplitn`] method on [slices]. 
-/// -/// [`rsplitn`]: ../../std/primitive.slice.html#method.rsplitn -/// [slices]: ../../std/primitive.slice.html -#[stable(feature = "rust1", since = "1.0.0")] -pub struct RSplitN<'a, T: 'a, P> where P: FnMut(&T) -> bool { - inner: GenericSplitN<RSplit<'a, T, P>> -} - -#[stable(feature = "core_impl_debug", since = "1.9.0")] -impl<T: fmt::Debug, P> fmt::Debug for RSplitN<'_, T, P> where P: FnMut(&T) -> bool { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("RSplitN") - .field("inner", &self.inner) - .finish() +macro_rules! forward_to_needle_api { + ($($(#[$meta:meta])* type $name:ident; $(#[$meta_mut:meta])* type $name_mut:ident;)+) => { + $( + $(#[$meta])* + pub type $name<'a, T, P> = ext::$name<&'a [T], self::needles::ElemSearcher<P>>; + $(#[$meta_mut])* + pub type $name_mut<'a, T, P> = ext::$name<&'a mut [T], self::needles::ElemSearcher<P>>; + )+ } } -/// An iterator over subslices separated by elements that match a predicate -/// function, limited to a given number of splits. -/// -/// This struct is created by the [`splitn_mut`] method on [slices]. -/// -/// [`splitn_mut`]: ../../std/primitive.slice.html#method.splitn_mut -/// [slices]: ../../std/primitive.slice.html -#[stable(feature = "rust1", since = "1.0.0")] -pub struct SplitNMut<'a, T: 'a, P> where P: FnMut(&T) -> bool { - inner: GenericSplitN<SplitMut<'a, T, P>> -} +forward_to_needle_api! { + /// An iterator over subslices separated by elements that match a predicate + /// function. + /// + /// This struct is created by the [`split`] method on [slices]. 
+ /// + /// [`split`]: ../../std/primitive.slice.html#method.split + /// [slices]: ../../std/primitive.slice.html + #[stable(feature = "rust1", since = "1.0.0")] + type Split; -#[stable(feature = "core_impl_debug", since = "1.9.0")] -impl<T: fmt::Debug, P> fmt::Debug for SplitNMut<'_, T, P> where P: FnMut(&T) -> bool { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SplitNMut") - .field("inner", &self.inner) - .finish() - } -} + /// An iterator over the subslices of the vector which are separated + /// by elements that match `pred`. + /// + /// This struct is created by the [`split_mut`] method on [slices]. + /// + /// [`split_mut`]: ../../std/primitive.slice.html#method.split_mut + /// [slices]: ../../std/primitive.slice.html + #[stable(feature = "rust1", since = "1.0.0")] + type SplitMut; -/// An iterator over subslices separated by elements that match a -/// predicate function, limited to a given number of splits, starting -/// from the end of the slice. -/// -/// This struct is created by the [`rsplitn_mut`] method on [slices]. -/// -/// [`rsplitn_mut`]: ../../std/primitive.slice.html#method.rsplitn_mut -/// [slices]: ../../std/primitive.slice.html -#[stable(feature = "rust1", since = "1.0.0")] -pub struct RSplitNMut<'a, T: 'a, P> where P: FnMut(&T) -> bool { - inner: GenericSplitN<RSplitMut<'a, T, P>> -} + /// An iterator over subslices separated by elements that match a predicate + /// function, starting from the end of the slice. + /// + /// This struct is created by the [`rsplit`] method on [slices]. 
+ /// + /// [`rsplit`]: ../../std/primitive.slice.html#method.rsplit + /// [slices]: ../../std/primitive.slice.html + #[stable(feature = "slice_rsplit", since = "1.27.0")] + type RSplit; -#[stable(feature = "core_impl_debug", since = "1.9.0")] -impl<T: fmt::Debug, P> fmt::Debug for RSplitNMut<'_, T, P> where P: FnMut(&T) -> bool { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("RSplitNMut") - .field("inner", &self.inner) - .finish() - } -} + /// An iterator over the subslices of the vector which are separated + /// by elements that match `pred`, starting from the end of the slice. + /// + /// This struct is created by the [`rsplit_mut`] method on [slices]. + /// + /// [`rsplit_mut`]: ../../std/primitive.slice.html#method.rsplit_mut + /// [slices]: ../../std/primitive.slice.html + #[stable(feature = "slice_rsplit", since = "1.27.0")] + type RSplitMut; -macro_rules! forward_iterator { - ($name:ident: $elem:ident, $iter_of:ty) => { - #[stable(feature = "rust1", since = "1.0.0")] - impl<'a, $elem, P> Iterator for $name<'a, $elem, P> where - P: FnMut(&T) -> bool - { - type Item = $iter_of; + /// An iterator over subslices separated by elements that match a predicate + /// function, limited to a given number of splits. + /// + /// This struct is created by the [`splitn`] method on [slices]. + /// + /// [`splitn`]: ../../std/primitive.slice.html#method.splitn + /// [slices]: ../../std/primitive.slice.html + #[stable(feature = "rust1", since = "1.0.0")] + type SplitN; - #[inline] - fn next(&mut self) -> Option<$iter_of> { - self.inner.next() - } + /// An iterator over subslices separated by elements that match a predicate + /// function, limited to a given number of splits. + /// + /// This struct is created by the [`splitn_mut`] method on [slices]. 
+ /// + /// [`splitn_mut`]: ../../std/primitive.slice.html#method.splitn_mut + /// [slices]: ../../std/primitive.slice.html + #[stable(feature = "rust1", since = "1.0.0")] + type SplitNMut; - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - self.inner.size_hint() - } - } + /// An iterator over subslices separated by elements that match a + /// predicate function, limited to a given number of splits, starting + /// from the end of the slice. + /// + /// This struct is created by the [`rsplitn`] method on [slices]. + /// + /// [`rsplitn`]: ../../std/primitive.slice.html#method.rsplitn + /// [slices]: ../../std/primitive.slice.html + #[stable(feature = "rust1", since = "1.0.0")] + type RSplitN; - #[stable(feature = "fused", since = "1.26.0")] - impl<'a, $elem, P> FusedIterator for $name<'a, $elem, P> - where P: FnMut(&T) -> bool {} - } + /// An iterator over subslices separated by elements that match a + /// predicate function, limited to a given number of splits, starting + /// from the end of the slice. + /// + /// This struct is created by the [`rsplitn_mut`] method on [slices]. + /// + /// [`rsplitn_mut`]: ../../std/primitive.slice.html#method.rsplitn_mut + /// [slices]: ../../std/primitive.slice.html + #[stable(feature = "rust1", since = "1.0.0")] + type RSplitNMut; } -forward_iterator! { SplitN: T, &'a [T] } -forward_iterator! { RSplitN: T, &'a [T] } -forward_iterator! { SplitNMut: T, &'a mut [T] } -forward_iterator! { RSplitNMut: T, &'a mut [T] } - /// An iterator over overlapping subslices of length `size`. /// /// This struct is created by the [`windows`] method on [slices]. 
diff --git a/src/libcore/slice/needles.rs b/src/libcore/slice/needles.rs new file mode 100644 index 0000000000000..f9bd79080ade2 --- /dev/null +++ b/src/libcore/slice/needles.rs @@ -0,0 +1,737 @@ +use crate::needle::*; +use crate::ops::Range; +use crate::cmp::{Ordering, max, min}; +use crate::usize; +use crate::fmt; + +//------------------------------------------------------------------------------ +// Element searcher +//------------------------------------------------------------------------------ + +#[derive(Clone)] +pub struct ElemSearcher<F> { + predicate: F, +} + +// we need to impl Debug for everything due to stability guarantee. +impl<F> fmt::Debug for ElemSearcher<F> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ElemSearcher").finish() + } +} + +macro_rules! impl_needle_with_elem_searcher { + (<[$($gen:tt)*]> $ty:ty) => { + impl<$($gen)*> Needle<$ty> for F + where + F: FnMut(&T) -> bool, + { + type Searcher = ElemSearcher<F>; + type Consumer = ElemSearcher<F>; + + #[inline] + fn into_searcher(self) -> Self::Searcher { + ElemSearcher { + predicate: self, + } + } + + #[inline] + fn into_consumer(self) -> Self::Consumer { + ElemSearcher { + predicate: self, + } + } + } + } +} + +impl_needle_with_elem_searcher!(<['h, T, F]> &'h [T]); +impl_needle_with_elem_searcher!(<['h, T, F]> &'h mut [T]); + +unsafe impl<T, F> Searcher<[T]> for ElemSearcher<F> +where + F: FnMut(&T) -> bool, +{ + #[inline] + fn search(&mut self, span: Span<&[T]>) -> Option<Range<usize>> { + let (rest, range) = span.into_parts(); + let start = range.start; + let pos = rest[range].iter().position(&mut self.predicate)?; + Some((pos + start)..(pos + start + 1)) + } +} + +unsafe impl<T, F> Consumer<[T]> for ElemSearcher<F> +where + F: FnMut(&T) -> bool, +{ + #[inline] + fn consume(&mut self, span: Span<&[T]>) -> Option<usize> { + let (hay, range) = span.into_parts(); + if range.end == range.start { + return None; + } + let x = unsafe { 
hay.get_unchecked(range.start) }; + if (self.predicate)(x) { + Some(range.start + 1) + } else { + None + } + } + + #[inline] + fn trim_start(&mut self, hay: &[T]) -> usize { + let mut it = hay.iter(); + let len = hay.len(); + if it.find(|x| !(self.predicate)(x)).is_some() { + len - it.as_slice().len() - 1 + } else { + len + } + } +} + +unsafe impl<T, F> ReverseSearcher<[T]> for ElemSearcher<F> +where + F: FnMut(&T) -> bool, +{ + #[inline] + fn rsearch(&mut self, span: Span<&[T]>) -> Option<Range<usize>> { + let (rest, range) = span.into_parts(); + let start = range.start; + let pos = rest[range].iter().rposition(&mut self.predicate)?; + Some((pos + start)..(pos + start + 1)) + } +} + +unsafe impl<T, F> ReverseConsumer<[T]> for ElemSearcher<F> +where + F: FnMut(&T) -> bool, +{ + #[inline] + fn rconsume(&mut self, span: Span<&[T]>) -> Option<usize> { + let (hay, range) = span.into_parts(); + if range.start == range.end { + return None; + } + let last = range.end - 1; + let x = unsafe { hay.get_unchecked(last) }; + if (self.predicate)(x) { + Some(last) + } else { + None + } + } + + #[inline] + fn trim_end(&mut self, hay: &[T]) -> usize { + hay.iter().rposition(|x| !(self.predicate)(x)).map_or(0, |p| p + 1) + } +} + +unsafe impl<T, F> DoubleEndedSearcher<[T]> for ElemSearcher<F> +where + F: FnMut(&T) -> bool, +{} + +unsafe impl<T, F> DoubleEndedConsumer<[T]> for ElemSearcher<F> +where + F: FnMut(&T) -> bool, +{} + +//------------------------------------------------------------------------------ +// Two way searcher helpers +//------------------------------------------------------------------------------ + +type FastSkipByteset = u64; + +trait FastSkipOptimization { + fn byteset_mask(&self) -> FastSkipByteset; +} + +impl<T: ?Sized> FastSkipOptimization for T { + #[inline] + default fn byteset_mask(&self) -> FastSkipByteset { !0 } +} + +impl FastSkipOptimization for u8 { + #[inline] + fn byteset_mask(&self) -> FastSkipByteset { 1 << (self & 63) } +} + +trait 
MaximalSuffix: Sized { + // Compute the maximal suffix of `&[T]`. + // + // The maximal suffix is a possible critical factorization (u, v) of `arr`. + // + // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the + // period of v. + // + // `order` determines if lexical order is `<` or `>`. Both + // orders must be computed -- the ordering with the largest `i` gives + // a critical factorization. + // + // For long period cases, the resulting period is not exact (it is too short). + fn maximal_suffix(arr: &[Self], order: Ordering) -> (usize, usize); + + // Compute the maximal suffix of the reverse of `arr`. + // + // The maximal suffix is a possible critical factorization (u', v') of `arr`. + // + // Returns `i` where `i` is the starting index of v', from the back; + // returns immediately when a period of `known_period` is reached. + // + // `order_greater` determines if lexical order is `<` or `>`. Both + // orders must be computed -- the ordering with the largest `i` gives + // a critical factorization. + // + // For long period cases, the resulting period is not exact (it is too short). + fn reverse_maximal_suffix(arr: &[Self], known_period: usize, order: Ordering) -> usize; +} + +// fallback to naive search for non-Ord slices. +impl<T: PartialEq> MaximalSuffix for T { + default fn maximal_suffix(_: &[Self], _: Ordering) -> (usize, usize) { + (0, 1) + } + + default fn reverse_maximal_suffix(_: &[Self], _: usize, _: Ordering) -> usize { + 0 + } +} + +impl<T: Ord> MaximalSuffix for T { + fn maximal_suffix(arr: &[Self], order: Ordering) -> (usize, usize) { + let mut left = 0; // Corresponds to i in the paper + let mut right = 1; // Corresponds to j in the paper + let mut offset = 0; // Corresponds to k in the paper, but starting at 0 + // to match 0-based indexing. + let mut period = 1; // Corresponds to p in the paper + + while let Some(a) = arr.get(right + offset) { + // `left` will be inbounds when `right` is. 
+ let b = &arr[left + offset]; + match a.cmp(b) { + Ordering::Equal => { + // Advance through repetition of the current period. + if offset + 1 == period { + right += offset + 1; + offset = 0; + } else { + offset += 1; + } + } + o if o == order => { + // Suffix is smaller, period is entire prefix so far. + right += offset + 1; + offset = 0; + period = right - left; + } + _ => { + // Suffix is larger, start over from current location. + left = right; + right += 1; + offset = 0; + period = 1; + } + }; + } + (left, period) + } + + fn reverse_maximal_suffix(arr: &[Self], known_period: usize, order: Ordering) -> usize { + let mut left = 0; // Corresponds to i in the paper + let mut right = 1; // Corresponds to j in the paper + let mut offset = 0; // Corresponds to k in the paper, but starting at 0 + // to match 0-based indexing. + let mut period = 1; // Corresponds to p in the paper + let n = arr.len(); + + while right + offset < n { + let a = &arr[n - (1 + right + offset)]; + let b = &arr[n - (1 + left + offset)]; + match a.cmp(b) { + Ordering::Equal => { + // Advance through repetition of the current period. + if offset + 1 == period { + right += offset + 1; + offset = 0; + } else { + offset += 1; + } + } + o if o == order => { + // Suffix is smaller, period is entire prefix so far. + right += offset + 1; + offset = 0; + period = right - left; + } + _ => { + // Suffix is larger, start over from current location. 
+ left = right; + right += 1; + offset = 0; + period = 1; + } + } + if period == known_period { + break; + } + } + debug_assert!(period <= known_period); + left + } +} + +//------------------------------------------------------------------------------ +// Two way searcher +//------------------------------------------------------------------------------ + +struct LongPeriod; +struct ShortPeriod; + +trait Period { + const IS_LONG_PERIOD: bool; +} +impl Period for LongPeriod { + const IS_LONG_PERIOD: bool = true; +} +impl Period for ShortPeriod { + const IS_LONG_PERIOD: bool = false; +} + +/// A slice searcher based on Two-Way algorithm. +#[derive(Debug)] +pub struct TwoWaySearcher<'p, T: 'p> { + // constants + /// critical factorization index + crit_pos: usize, + /// critical factorization index for reversed needle + crit_pos_back: usize, + + period: usize, + + /// `byteset` is an extension (not part of the two way algorithm); + /// it's a 64-bit "fingerprint" where each set bit `j` corresponds + /// to a (byte & 63) == j present in the needle. + byteset: FastSkipByteset, + + needle: &'p [T], + + // variables + /// index into needle before which we have already matched + memory: usize, + /// index into needle after which we have already matched + memory_back: usize, +} + +impl<'p, T: 'p> Clone for TwoWaySearcher<'p, T> { + fn clone(&self) -> Self { + *self + } +} + +impl<'p, T: 'p> Copy for TwoWaySearcher<'p, T> {} + +impl<'p, T> TwoWaySearcher<'p, T> +where + T: PartialEq + 'p, +{ + #[inline] + fn do_next<P: Period>(&mut self, hay: &[T], range: Range<usize>) -> Option<Range<usize>> { + let needle = self.needle; + + let mut position = range.start; + 'search: loop { + // Check that we have room to search in + // position + needle_last can not overflow if we assume slices + // are bounded by isize's range. 
+ let i = position + (needle.len() - 1); + if i >= range.end { + return None; + } + // let tail_item = &hay[i]; // using get_unchecked here would be slower + let tail_item = unsafe { hay.get_unchecked(i) }; + + // Quickly skip by large portions unrelated to our substring + if !self.byteset_contains(tail_item) { + position += needle.len(); + if !P::IS_LONG_PERIOD { + self.memory = 0; + } + continue 'search; + } + + // See if the right part of the needle matches + let start = if P::IS_LONG_PERIOD { + self.crit_pos + } else { + max(self.crit_pos, self.memory) + }; + for i in start..needle.len() { + if unsafe { needle.get_unchecked(i) != hay.get_unchecked(position + i) } { + position += i - self.crit_pos + 1; + if !P::IS_LONG_PERIOD { + self.memory = 0; + } + continue 'search; + } + } + + // See if the left part of the needle matches + let start = if P::IS_LONG_PERIOD { 0 } else { self.memory }; + for i in (start..self.crit_pos).rev() { + if unsafe { needle.get_unchecked(i) != hay.get_unchecked(position + i) } { + position += self.period; + if !P::IS_LONG_PERIOD { + self.memory = needle.len() - self.period; + } + continue 'search; + } + } + + // We have found a match! 
+ // Note: add self.period instead of needle.len() to have overlapping matches + if !P::IS_LONG_PERIOD { + self.memory = 0; // set to needle.len() - self.period for overlapping matches + } + return Some(position..(position + needle.len())); + } + } + + #[inline] + pub(crate) fn next(&mut self, hay: &[T], range: Range<usize>) -> Option<Range<usize>> { + if self.memory != usize::MAX { + self.do_next::<ShortPeriod>(hay, range) + } else { + self.do_next::<LongPeriod>(hay, range) + } + } + + #[inline] + fn do_next_back<P: Period>(&mut self, hay: &[T], range: Range<usize>) -> Option<Range<usize>> { + let needle = self.needle; + let mut end = range.end; + 'search: loop { + // Check that we have room to search in + // end - needle.len() will wrap around when there is no more room, + // but due to slice length limits it can never wrap all the way back + // into the length of hay. + if needle.len() + range.start > end { + return None; + } + let front_item = unsafe { hay.get_unchecked(end.wrapping_sub(needle.len())) }; + + // Quickly skip by large portions unrelated to our substring + if !self.byteset_contains(front_item) { + end -= needle.len(); + if !P::IS_LONG_PERIOD { + self.memory_back = needle.len(); + } + continue 'search; + } + + // See if the left part of the needle matches + let crit = if P::IS_LONG_PERIOD { + self.crit_pos_back + } else { + min(self.crit_pos_back, self.memory_back) + }; + for i in (0..crit).rev() { + if unsafe { needle.get_unchecked(i) != hay.get_unchecked(end - needle.len() + i) } { + end -= self.crit_pos_back - i; + if !P::IS_LONG_PERIOD { + self.memory_back = needle.len(); + } + continue 'search; + } + } + + // See if the right part of the needle matches + let needle_end = if P::IS_LONG_PERIOD { needle.len() } else { self.memory_back }; + for i in self.crit_pos_back..needle_end { + if unsafe { needle.get_unchecked(i) != hay.get_unchecked(end - needle.len() + i) } { + end -= self.period; + if !P::IS_LONG_PERIOD { + self.memory_back = self.period; 
+ } + continue 'search; + } + } + + // We have found a match! + if !P::IS_LONG_PERIOD { + self.memory_back = needle.len(); + } + return Some((end - needle.len())..end); + } + } + + #[inline] + pub(crate) fn next_back(&mut self, hay: &[T], range: Range<usize>) -> Option<Range<usize>> { + if self.memory != usize::MAX { + self.do_next_back::<ShortPeriod>(hay, range) + } else { + self.do_next_back::<LongPeriod>(hay, range) + } + } + + #[inline] + pub fn new(needle: &'p [T]) -> Self { + let res_lt = T::maximal_suffix(needle, Ordering::Less); + let res_gt = T::maximal_suffix(needle, Ordering::Greater); + let (crit_pos, period) = max(res_lt, res_gt); + + let byteset = Self::byteset_create(needle); + + // A particularly readable explanation of what's going on here can be found + // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically + // see the code for "Algorithm CP" on p. 323. + // + // What's going on is we have some critical factorization (u, v) of the + // needle, and we want to determine whether u is a suffix of + // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use + // "Algorithm CP2", which is optimized for when the period of the needle + // is large. + if needle[..crit_pos] == needle[period..(period + crit_pos)] { + // short period case -- the period is exact + // compute a separate critical factorization for the reversed needle + // x = u' v' where |v'| < period(x). + // + // This is sped up by the period being known already. + // Note that a case like x = "acba" may be factored exactly forwards + // (crit_pos = 1, period = 3) while being factored with approximate + // period in reverse (crit_pos = 2, period = 2). We use the given + // reverse factorization but keep the exact period. 
+ let crit_pos_back = needle.len() - max( + T::reverse_maximal_suffix(needle, period, Ordering::Greater), + T::reverse_maximal_suffix(needle, period, Ordering::Less), + ); + + Self { + crit_pos, + crit_pos_back, + period, + byteset, + needle, + memory: 0, + memory_back: needle.len(), + } + } else { + Self { + crit_pos, + crit_pos_back: crit_pos, + period: max(crit_pos, needle.len() - crit_pos) + 1, + byteset, + needle, + memory: usize::MAX, // Dummy value to signify that the period is long + memory_back: usize::MAX, + } + } + } + + #[inline] + fn byteset_create(needle: &[T]) -> FastSkipByteset { + needle.iter().fold(0, |a, b| b.byteset_mask() | a) + } + #[inline] + fn byteset_contains(&self, item: &T) -> bool { + (self.byteset & item.byteset_mask()) != 0 + } +} + +unsafe impl<'p, T> Searcher<[T]> for TwoWaySearcher<'p, T> +where + T: PartialEq + 'p, +{ + #[inline] + fn search(&mut self, span: Span<&[T]>) -> Option<Range<usize>> { + let (hay, range) = span.into_parts(); + self.next(hay, range) + } +} + +unsafe impl<'p, T> ReverseSearcher<[T]> for TwoWaySearcher<'p, T> +where + T: PartialEq + 'p, +{ + #[inline] + fn rsearch(&mut self, span: Span<&[T]>) -> Option<Range<usize>> { + let (hay, range) = span.into_parts(); + self.next_back(hay, range) + } +} + +//------------------------------------------------------------------------------ +// Naive (state-less) searcher +//------------------------------------------------------------------------------ + +#[derive(Debug)] +pub struct NaiveSearcher<'p, T: 'p>(&'p [T]); + +impl<'p, T: 'p> Clone for NaiveSearcher<'p, T> { + fn clone(&self) -> Self { + *self + } +} + +impl<'p, T: 'p> Copy for NaiveSearcher<'p, T> {} + +unsafe impl<'p, T> Consumer<[T]> for NaiveSearcher<'p, T> +where + T: PartialEq + 'p, +{ + #[inline] + fn consume(&mut self, span: Span<&[T]>) -> Option<usize> { + let (hay, range) = span.into_parts(); + let check_end = range.start + self.0.len(); + if range.end < check_end { + return None; + } + if unsafe { 
hay.get_unchecked(range.start..check_end) } == self.0 { + Some(check_end) + } else { + None + } + } +} + +unsafe impl<'p, T> ReverseConsumer<[T]> for NaiveSearcher<'p, T> +where + T: PartialEq + 'p, +{ + #[inline] + fn rconsume(&mut self, span: Span<&[T]>) -> Option<usize> { + let (hay, range) = span.into_parts(); + if range.start + self.0.len() > range.end { + return None; + } + let index = range.end - self.0.len(); + if unsafe { hay.get_unchecked(index..range.end) } == self.0 { + Some(index) + } else { + None + } + } +} + +impl<'p, T: 'p> NaiveSearcher<'p, T> { + #[inline] + pub fn new(slice: &'p [T]) -> Self { + NaiveSearcher(slice) + } + + #[inline] + pub fn needle(&self) -> &'p [T] { + self.0 + } +} + +//------------------------------------------------------------------------------ +// Slice searcher +//------------------------------------------------------------------------------ + +#[derive(Debug)] +pub enum SliceSearcher<'p, T: 'p> { + TwoWay(TwoWaySearcher<'p, T>), + Empty(EmptySearcher), +} + +impl<'p, T: PartialEq + 'p> SliceSearcher<'p, T> { + #[inline] + pub fn new(slice: &'p [T]) -> Self { + if slice.is_empty() { + SliceSearcher::Empty(EmptySearcher::default()) + } else { + SliceSearcher::TwoWay(TwoWaySearcher::new(slice)) + } + } + + #[inline] + pub fn needle(&self) -> &'p [T] { + match self { + SliceSearcher::TwoWay(s) => s.needle, + SliceSearcher::Empty(_) => &[], + } + } +} + +impl<'p, T: 'p> Clone for SliceSearcher<'p, T> { + #[inline] + fn clone(&self) -> Self { + match self { + SliceSearcher::TwoWay(s) => SliceSearcher::TwoWay(*s), + SliceSearcher::Empty(s) => SliceSearcher::Empty(s.clone()), + } + } +} + +macro_rules! 
forward { + (searcher: $self:expr, $s:ident => $e:expr) => { + match $self { + SliceSearcher::TwoWay($s) => $e, + SliceSearcher::Empty($s) => $e, + } + }; +} + +unsafe impl<'p, T, A> Searcher<A> for SliceSearcher<'p, T> +where + A: Hay<Index = usize> + ?Sized, + TwoWaySearcher<'p, T>: Searcher<A>, +{ + #[inline] + fn search(&mut self, span: Span<&A>) -> Option<Range<usize>> { + forward!(searcher: self, s => s.search(span)) + } +} + +unsafe impl<'p, T, A> ReverseSearcher<A> for SliceSearcher<'p, T> +where + A: Hay<Index = usize> + ?Sized, + TwoWaySearcher<'p, T>: ReverseSearcher<A>, +{ + #[inline] + fn rsearch(&mut self, span: Span<&A>) -> Option<Range<usize>> { + forward!(searcher: self, s => s.rsearch(span)) + } +} + +macro_rules! impl_needle_for_slice_searcher { + ([$($gen:tt)+] <$haystack:ty> for $ty:ty) => { + impl<$($gen)+, 'h, T> Needle<$haystack> for $ty + where + T: PartialEq + 'p, + { + type Searcher = SliceSearcher<'p, T>; + type Consumer = NaiveSearcher<'p, T>; + + #[inline] + fn into_searcher(self) -> Self::Searcher { + SliceSearcher::new(self) + } + + #[inline] + fn into_consumer(self) -> Self::Consumer { + NaiveSearcher::new(self) + } + } + }; + + ($($index:expr),*) => { + impl_needle_for_slice_searcher!(['p] <&'h [T]> for &'p [T]); + impl_needle_for_slice_searcher!(['p] <&'h mut [T]> for &'p [T]); + impl_needle_for_slice_searcher!(['q, 'p] <&'h [T]> for &'q &'p [T]); + impl_needle_for_slice_searcher!(['q, 'p] <&'h mut [T]> for &'q &'p [T]); + $( + impl_needle_for_slice_searcher!(['p] <&'h [T]> for &'p [T; $index]); + impl_needle_for_slice_searcher!(['p] <&'h mut [T]> for &'p [T; $index]); + )* + } +} + +impl_needle_for_slice_searcher!( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 +); diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index 379c263c04ca6..d607d1849b295 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -4,19 +4,20 @@ 
#![stable(feature = "rust1", since = "1.0.0")] -use self::pattern::Pattern; -use self::pattern::{Searcher, ReverseSearcher, DoubleEndedSearcher}; - use crate::char; use crate::fmt::{self, Write}; use crate::iter::{Map, Cloned, FusedIterator, TrustedLen, TrustedRandomAccess, Filter}; use crate::iter::{Flatten, FlatMap, Chain}; use crate::slice::{self, SliceIndex, Split as SliceSplit}; use crate::mem; -use crate::ops::Try; +use crate::needle::{ + ext, Needle, Searcher, ReverseSearcher, Consumer, ReverseConsumer, DoubleEndedConsumer, +}; +use crate::ops::{Range, Try}; use crate::option; -pub mod pattern; +#[unstable(feature = "str_internals", issue = "0")] +mod needles; #[unstable(feature = "str_internals", issue = "0")] #[allow(missing_docs)] @@ -835,476 +836,47 @@ unsafe impl TrustedRandomAccess for Bytes<'_> { fn may_have_side_effect() -> bool { false } } -/// This macro generates a Clone impl for string pattern API -/// wrapper types of the form X<'a, P> -macro_rules! derive_pattern_clone { - (clone $t:ident with |$s:ident| $e:expr) => { - impl<'a, P: Pattern<'a>> Clone for $t<'a, P> - where P::Searcher: Clone - { - fn clone(&self) -> Self { - let $s = self; - $e - } - } +macro_rules! forward_to_needle_api { + ($(#[$link:meta] type $name:ident;)+) => { + $( + /// Created with the method + #[$link] + #[stable(feature = "rust1", since = "1.0.0")] + pub type $name<'a, P> = ext::$name<&'a str, <P as Needle<&'a str>>::Searcher>; + )+ } } -/// This macro generates two public iterator structs -/// wrapping a private internal one that makes use of the `Pattern` API. 
-/// -/// For all patterns `P: Pattern<'a>` the following items will be -/// generated (generics omitted): -/// -/// struct $forward_iterator($internal_iterator); -/// struct $reverse_iterator($internal_iterator); -/// -/// impl Iterator for $forward_iterator -/// { /* internal ends up calling Searcher::next_match() */ } -/// -/// impl DoubleEndedIterator for $forward_iterator -/// where P::Searcher: DoubleEndedSearcher -/// { /* internal ends up calling Searcher::next_match_back() */ } -/// -/// impl Iterator for $reverse_iterator -/// where P::Searcher: ReverseSearcher -/// { /* internal ends up calling Searcher::next_match_back() */ } -/// -/// impl DoubleEndedIterator for $reverse_iterator -/// where P::Searcher: DoubleEndedSearcher -/// { /* internal ends up calling Searcher::next_match() */ } -/// -/// The internal one is defined outside the macro, and has almost the same -/// semantic as a DoubleEndedIterator by delegating to `pattern::Searcher` and -/// `pattern::ReverseSearcher` for both forward and reverse iteration. -/// -/// "Almost", because a `Searcher` and a `ReverseSearcher` for a given -/// `Pattern` might not return the same elements, so actually implementing -/// `DoubleEndedIterator` for it would be incorrect. -/// (See the docs in `str::pattern` for more details) -/// -/// However, the internal struct still represents a single ended iterator from -/// either end, and depending on pattern is also a valid double ended iterator, -/// so the two wrapper structs implement `Iterator` -/// and `DoubleEndedIterator` depending on the concrete pattern type, leading -/// to the complex impls seen above. -macro_rules! 
generate_pattern_iterators { - { - // Forward iterator - forward: - $(#[$forward_iterator_attribute:meta])* - struct $forward_iterator:ident; - - // Reverse iterator - reverse: - $(#[$reverse_iterator_attribute:meta])* - struct $reverse_iterator:ident; - - // Stability of all generated items - stability: - $(#[$common_stability_attribute:meta])* - - // Internal almost-iterator that is being delegated to - internal: - $internal_iterator:ident yielding ($iterty:ty); - - // Kind of delegation - either single ended or double ended - delegate $($t:tt)* - } => { - $(#[$forward_iterator_attribute])* - $(#[$common_stability_attribute])* - pub struct $forward_iterator<'a, P: Pattern<'a>>($internal_iterator<'a, P>); - - $(#[$common_stability_attribute])* - impl<'a, P: Pattern<'a>> fmt::Debug for $forward_iterator<'a, P> - where P::Searcher: fmt::Debug - { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_tuple(stringify!($forward_iterator)) - .field(&self.0) - .finish() - } - } +forward_to_needle_api! 
{ + /// [`split`](../../std/primitive.str.html#method.split) + type Split; - $(#[$common_stability_attribute])* - impl<'a, P: Pattern<'a>> Iterator for $forward_iterator<'a, P> { - type Item = $iterty; + /// [`rsplit`](../../std/primitive.str.html#method.rsplit) + type RSplit; - #[inline] - fn next(&mut self) -> Option<$iterty> { - self.0.next() - } - } + /// [`split_terminator`](../../std/primitive.str.html#method.split_terminator) + type SplitTerminator; - $(#[$common_stability_attribute])* - impl<'a, P: Pattern<'a>> Clone for $forward_iterator<'a, P> - where P::Searcher: Clone - { - fn clone(&self) -> Self { - $forward_iterator(self.0.clone()) - } - } + /// [`rsplit_terminator`](../../std/primitive.str.html#method.rsplit_terminator) + type RSplitTerminator; - $(#[$reverse_iterator_attribute])* - $(#[$common_stability_attribute])* - pub struct $reverse_iterator<'a, P: Pattern<'a>>($internal_iterator<'a, P>); + /// [`splitn`](../../std/primitive.str.html#method.splitn) + type SplitN; - $(#[$common_stability_attribute])* - impl<'a, P: Pattern<'a>> fmt::Debug for $reverse_iterator<'a, P> - where P::Searcher: fmt::Debug - { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_tuple(stringify!($reverse_iterator)) - .field(&self.0) - .finish() - } - } + /// [`rsplitn`](../../std/primitive.str.html#method.rsplitn) + type RSplitN; - $(#[$common_stability_attribute])* - impl<'a, P: Pattern<'a>> Iterator for $reverse_iterator<'a, P> - where P::Searcher: ReverseSearcher<'a> - { - type Item = $iterty; + /// [`match_indices`](../../std/primitive.str.html#method.match_indices) + type MatchIndices; - #[inline] - fn next(&mut self) -> Option<$iterty> { - self.0.next_back() - } - } - - $(#[$common_stability_attribute])* - impl<'a, P: Pattern<'a>> Clone for $reverse_iterator<'a, P> - where P::Searcher: Clone - { - fn clone(&self) -> Self { - $reverse_iterator(self.0.clone()) - } - } - - #[stable(feature = "fused", since = "1.26.0")] - impl<'a, P: Pattern<'a>> 
FusedIterator for $forward_iterator<'a, P> {} - - #[stable(feature = "fused", since = "1.26.0")] - impl<'a, P: Pattern<'a>> FusedIterator for $reverse_iterator<'a, P> - where P::Searcher: ReverseSearcher<'a> {} - - generate_pattern_iterators!($($t)* with $(#[$common_stability_attribute])*, - $forward_iterator, - $reverse_iterator, $iterty); - }; - { - double ended; with $(#[$common_stability_attribute:meta])*, - $forward_iterator:ident, - $reverse_iterator:ident, $iterty:ty - } => { - $(#[$common_stability_attribute])* - impl<'a, P: Pattern<'a>> DoubleEndedIterator for $forward_iterator<'a, P> - where P::Searcher: DoubleEndedSearcher<'a> - { - #[inline] - fn next_back(&mut self) -> Option<$iterty> { - self.0.next_back() - } - } - - $(#[$common_stability_attribute])* - impl<'a, P: Pattern<'a>> DoubleEndedIterator for $reverse_iterator<'a, P> - where P::Searcher: DoubleEndedSearcher<'a> - { - #[inline] - fn next_back(&mut self) -> Option<$iterty> { - self.0.next() - } - } - }; - { - single ended; with $(#[$common_stability_attribute:meta])*, - $forward_iterator:ident, - $reverse_iterator:ident, $iterty:ty - } => {} -} + /// [`rmatch_indices`](../../std/primitive.str.html#method.rmatch_indices) + type RMatchIndices; -derive_pattern_clone!{ - clone SplitInternal - with |s| SplitInternal { matcher: s.matcher.clone(), ..*s } -} - -struct SplitInternal<'a, P: Pattern<'a>> { - start: usize, - end: usize, - matcher: P::Searcher, - allow_trailing_empty: bool, - finished: bool, -} - -impl<'a, P: Pattern<'a>> fmt::Debug for SplitInternal<'a, P> where P::Searcher: fmt::Debug { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SplitInternal") - .field("start", &self.start) - .field("end", &self.end) - .field("matcher", &self.matcher) - .field("allow_trailing_empty", &self.allow_trailing_empty) - .field("finished", &self.finished) - .finish() - } -} + /// [`matches`](../../std/primitive.str.html#method.matches) + type Matches; -impl<'a, P: 
Pattern<'a>> SplitInternal<'a, P> { - #[inline] - fn get_end(&mut self) -> Option<&'a str> { - if !self.finished && (self.allow_trailing_empty || self.end - self.start > 0) { - self.finished = true; - unsafe { - let string = self.matcher.haystack().get_unchecked(self.start..self.end); - Some(string) - } - } else { - None - } - } - - #[inline] - fn next(&mut self) -> Option<&'a str> { - if self.finished { return None } - - let haystack = self.matcher.haystack(); - match self.matcher.next_match() { - Some((a, b)) => unsafe { - let elt = haystack.get_unchecked(self.start..a); - self.start = b; - Some(elt) - }, - None => self.get_end(), - } - } - - #[inline] - fn next_back(&mut self) -> Option<&'a str> - where P::Searcher: ReverseSearcher<'a> - { - if self.finished { return None } - - if !self.allow_trailing_empty { - self.allow_trailing_empty = true; - match self.next_back() { - Some(elt) if !elt.is_empty() => return Some(elt), - _ => if self.finished { return None } - } - } - - let haystack = self.matcher.haystack(); - match self.matcher.next_match_back() { - Some((a, b)) => unsafe { - let elt = haystack.get_unchecked(b..self.end); - self.end = a; - Some(elt) - }, - None => unsafe { - self.finished = true; - Some(haystack.get_unchecked(self.start..self.end)) - }, - } - } -} - -generate_pattern_iterators! { - forward: - /// Created with the method [`split`]. - /// - /// [`split`]: ../../std/primitive.str.html#method.split - struct Split; - reverse: - /// Created with the method [`rsplit`]. - /// - /// [`rsplit`]: ../../std/primitive.str.html#method.rsplit - struct RSplit; - stability: - #[stable(feature = "rust1", since = "1.0.0")] - internal: - SplitInternal yielding (&'a str); - delegate double ended; -} - -generate_pattern_iterators! { - forward: - /// Created with the method [`split_terminator`]. 
- /// - /// [`split_terminator`]: ../../std/primitive.str.html#method.split_terminator - struct SplitTerminator; - reverse: - /// Created with the method [`rsplit_terminator`]. - /// - /// [`rsplit_terminator`]: ../../std/primitive.str.html#method.rsplit_terminator - struct RSplitTerminator; - stability: - #[stable(feature = "rust1", since = "1.0.0")] - internal: - SplitInternal yielding (&'a str); - delegate double ended; -} - -derive_pattern_clone!{ - clone SplitNInternal - with |s| SplitNInternal { iter: s.iter.clone(), ..*s } -} - -struct SplitNInternal<'a, P: Pattern<'a>> { - iter: SplitInternal<'a, P>, - /// The number of splits remaining - count: usize, -} - -impl<'a, P: Pattern<'a>> fmt::Debug for SplitNInternal<'a, P> where P::Searcher: fmt::Debug { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SplitNInternal") - .field("iter", &self.iter) - .field("count", &self.count) - .finish() - } -} - -impl<'a, P: Pattern<'a>> SplitNInternal<'a, P> { - #[inline] - fn next(&mut self) -> Option<&'a str> { - match self.count { - 0 => None, - 1 => { self.count = 0; self.iter.get_end() } - _ => { self.count -= 1; self.iter.next() } - } - } - - #[inline] - fn next_back(&mut self) -> Option<&'a str> - where P::Searcher: ReverseSearcher<'a> - { - match self.count { - 0 => None, - 1 => { self.count = 0; self.iter.get_end() } - _ => { self.count -= 1; self.iter.next_back() } - } - } -} - -generate_pattern_iterators! { - forward: - /// Created with the method [`splitn`]. - /// - /// [`splitn`]: ../../std/primitive.str.html#method.splitn - struct SplitN; - reverse: - /// Created with the method [`rsplitn`]. 
- /// - /// [`rsplitn`]: ../../std/primitive.str.html#method.rsplitn - struct RSplitN; - stability: - #[stable(feature = "rust1", since = "1.0.0")] - internal: - SplitNInternal yielding (&'a str); - delegate single ended; -} - -derive_pattern_clone!{ - clone MatchIndicesInternal - with |s| MatchIndicesInternal(s.0.clone()) -} - -struct MatchIndicesInternal<'a, P: Pattern<'a>>(P::Searcher); - -impl<'a, P: Pattern<'a>> fmt::Debug for MatchIndicesInternal<'a, P> where P::Searcher: fmt::Debug { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_tuple("MatchIndicesInternal") - .field(&self.0) - .finish() - } -} - -impl<'a, P: Pattern<'a>> MatchIndicesInternal<'a, P> { - #[inline] - fn next(&mut self) -> Option<(usize, &'a str)> { - self.0.next_match().map(|(start, end)| unsafe { - (start, self.0.haystack().get_unchecked(start..end)) - }) - } - - #[inline] - fn next_back(&mut self) -> Option<(usize, &'a str)> - where P::Searcher: ReverseSearcher<'a> - { - self.0.next_match_back().map(|(start, end)| unsafe { - (start, self.0.haystack().get_unchecked(start..end)) - }) - } -} - -generate_pattern_iterators! { - forward: - /// Created with the method [`match_indices`]. - /// - /// [`match_indices`]: ../../std/primitive.str.html#method.match_indices - struct MatchIndices; - reverse: - /// Created with the method [`rmatch_indices`]. 
- /// - /// [`rmatch_indices`]: ../../std/primitive.str.html#method.rmatch_indices - struct RMatchIndices; - stability: - #[stable(feature = "str_match_indices", since = "1.5.0")] - internal: - MatchIndicesInternal yielding ((usize, &'a str)); - delegate double ended; -} - -derive_pattern_clone!{ - clone MatchesInternal - with |s| MatchesInternal(s.0.clone()) -} - -struct MatchesInternal<'a, P: Pattern<'a>>(P::Searcher); - -impl<'a, P: Pattern<'a>> fmt::Debug for MatchesInternal<'a, P> where P::Searcher: fmt::Debug { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_tuple("MatchesInternal") - .field(&self.0) - .finish() - } -} - -impl<'a, P: Pattern<'a>> MatchesInternal<'a, P> { - #[inline] - fn next(&mut self) -> Option<&'a str> { - self.0.next_match().map(|(a, b)| unsafe { - // Indices are known to be on utf8 boundaries - self.0.haystack().get_unchecked(a..b) - }) - } - - #[inline] - fn next_back(&mut self) -> Option<&'a str> - where P::Searcher: ReverseSearcher<'a> - { - self.0.next_match_back().map(|(a, b)| unsafe { - // Indices are known to be on utf8 boundaries - self.0.haystack().get_unchecked(a..b) - }) - } -} - -generate_pattern_iterators! { - forward: - /// Created with the method [`matches`]. - /// - /// [`matches`]: ../../std/primitive.str.html#method.matches - struct Matches; - reverse: - /// Created with the method [`rmatches`]. - /// - /// [`rmatches`]: ../../std/primitive.str.html#method.rmatches - struct RMatches; - stability: - #[stable(feature = "str_matches", since = "1.2.0")] - internal: - MatchesInternal yielding (&'a str); - delegate double ended; + /// [`rmatches`](../../std/primitive.str.html#method.rmatches) + type RMatches; } /// An iterator over the lines of a string, as string slices. 
@@ -2824,8 +2396,12 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn contains<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { - pat.is_contained_in(self) + pub fn contains<'a, P: Needle<&'a str>>(&'a self, pat: P) -> bool + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::contains(self, pat) } /// Returns `true` if the given pattern matches a prefix of this @@ -2844,8 +2420,12 @@ impl str { /// assert!(!bananas.starts_with("nana")); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - pub fn starts_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { - pat.is_prefix_of(self) + pub fn starts_with<'a, P: Needle<&'a str>>(&'a self, pat: P) -> bool + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::starts_with(self, pat) } /// Returns `true` if the given pattern matches a suffix of this @@ -2864,10 +2444,12 @@ impl str { /// assert!(!bananas.ends_with("nana")); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - pub fn ends_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool - where P::Searcher: ReverseSearcher<'a> + pub fn ends_with<'a, P: Needle<&'a str>>(&'a self, pat: P) -> bool + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: ReverseConsumer<str>, { - pat.is_suffix_of(self) + ext::ends_with(self, pat) } /// Returns the byte index of the first character of this string slice that @@ -2913,8 +2495,12 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option<usize> { - pat.into_searcher(self).next_match().map(|(i, _)| i) + pub fn find<'a, P: Needle<&'a str>>(&'a self, pat: P) -> Option<usize> + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::find(self, pat) } /// Returns the byte index of the last character of this string 
slice that @@ -2957,10 +2543,123 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn rfind<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option<usize> - where P::Searcher: ReverseSearcher<'a> + pub fn rfind<'a, P: Needle<&'a str>>(&'a self, pat: P) -> Option<usize> + where + P::Searcher: ReverseSearcher<str>, + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::rfind(self, pat) + } + + /// Returns the byte range of the first substring of this string slice that + /// the pattern can be found. + /// + /// Returns [`None`] if the pattern doesn't match. + /// + /// The pattern can be a `&str`, [`char`], or a closure that determines if + /// a substring matches. + /// + /// [`None`]: option/enum.Option.html#variant.None + /// + /// # Examples + /// + /// Simple patterns: + /// + /// ``` + /// #![feature(str_find_range)] + /// + /// let s = "Löwe 老虎 Léopard"; + /// + /// assert_eq!(s.find_range('L'), Some(0..1)); + /// assert_eq!(s.find_range('é'), Some(14..16)); + /// assert_eq!(s.find_range("Léopard"), Some(13..21)); + /// ``` + /// + /// More complex patterns using point-free style and closures: + /// + /// ``` + /// #![feature(str_find_range)] + /// + /// let s = "Löwe 老虎 Léopard"; + /// + /// assert_eq!(s.find_range(char::is_whitespace), Some(5..6)); + /// assert_eq!(s.find_range(char::is_lowercase), Some(1..3)); + /// assert_eq!(s.find_range(|c: char| c.is_whitespace() || c.is_lowercase()), Some(1..3)); + /// assert_eq!(s.find_range(|c: char| (c < 'o') && (c > 'a')), Some(4..5)); + /// ``` + /// + /// Not finding the pattern: + /// + /// ``` + /// #![feature(str_find_range)] + /// + /// let s = "Löwe 老虎 Léopard"; + /// let x: &[_] = &['1', '2']; + /// + /// assert_eq!(s.find_range(x), None); + /// ``` + #[unstable(feature = "str_find_range", issue = "56345")] + #[inline] + pub fn find_range<'a, P: Needle<&'a str>>(&'a self, pat: P) -> Option<Range<usize>> + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: 
Consumer<str>, // FIXME: RFC 2089 + { + ext::find_range(self, pat) + } + + /// Returns the byte range of the last substring of this string slice that + /// the pattern can be found. + /// + /// Returns [`None`] if the pattern doesn't match. + /// + /// The pattern can be a `&str`, [`char`], or a closure that determines if + /// a substring matches. + /// + /// [`None`]: option/enum.Option.html#variant.None + /// + /// # Examples + /// + /// Simple patterns: + /// + /// ``` + /// #![feature(str_find_range)] + /// + /// let s = "Löwe 老虎 Léopard"; + /// + /// assert_eq!(s.rfind_range('L'), Some(13..14)); + /// assert_eq!(s.rfind_range('é'), Some(14..16)); + /// ``` + /// + /// More complex patterns with closures: + /// + /// ``` + /// #![feature(str_find_range)] + /// + /// let s = "Löwe 老虎 Léopard"; + /// + /// assert_eq!(s.rfind_range(char::is_whitespace), Some(12..13)); + /// assert_eq!(s.rfind_range(char::is_lowercase), Some(20..21)); + /// ``` + /// + /// Not finding the pattern: + /// + /// ``` + /// #![feature(str_find_range)] + /// + /// let s = "Löwe 老虎 Léopard"; + /// let x: &[_] = &['1', '2']; + /// + /// assert_eq!(s.rfind_range(x), None); + /// ``` + #[unstable(feature = "str_find_range", issue = "56345")] + #[inline] + pub fn rfind_range<'a, P: Needle<&'a str>>(&'a self, pat: P) -> Option<Range<usize>> + where + P::Searcher: ReverseSearcher<str>, + P::Consumer: Consumer<str>, // FIXME: RFC 2089 { - pat.into_searcher(self).next_match_back().map(|(i, _)| i) + ext::rfind_range(self, pat) } /// An iterator over substrings of this string slice, separated by @@ -3070,14 +2769,12 @@ impl str { /// [`split_whitespace`]: #method.split_whitespace #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn split<'a, P: Pattern<'a>>(&'a self, pat: P) -> Split<'a, P> { - Split(SplitInternal { - start: 0, - end: self.len(), - matcher: pat.into_searcher(self), - allow_trailing_empty: true, - finished: false, - }) + pub fn split<'a, P: Needle<&'a str>>(&'a self, 
pat: P) -> Split<'a, P> + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::split(self, pat) } /// An iterator over substrings of the given string slice, separated by @@ -3124,10 +2821,12 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn rsplit<'a, P: Pattern<'a>>(&'a self, pat: P) -> RSplit<'a, P> - where P::Searcher: ReverseSearcher<'a> + pub fn rsplit<'a, P: Needle<&'a str>>(&'a self, pat: P) -> RSplit<'a, P> + where + P::Searcher: ReverseSearcher<str>, + P::Consumer: Consumer<str>, // FIXME: RFC 2089 { - RSplit(self.split(pat).0) + ext::rsplit(self, pat) } /// An iterator over substrings of the given string slice, separated by @@ -3170,11 +2869,12 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn split_terminator<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitTerminator<'a, P> { - SplitTerminator(SplitInternal { - allow_trailing_empty: false, - ..self.split(pat).0 - }) + pub fn split_terminator<'a, P: Needle<&'a str>>(&'a self, pat: P) -> SplitTerminator<'a, P> + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::split_terminator(self, pat) } /// An iterator over substrings of `self`, separated by characters @@ -3185,10 +2885,10 @@ impl str { /// Additional libraries might provide more complex patterns like /// regular expressions. /// - /// Equivalent to [`split`], except that the trailing substring is + /// Equivalent to [`rsplit`], except that the trailing substring is /// skipped if empty. /// - /// [`split`]: #method.split + /// [`rsplit`]: #method.rsplit /// /// This method can be used for string data that is _terminated_, /// rather than _separated_ by a pattern. 
@@ -3215,10 +2915,12 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn rsplit_terminator<'a, P: Pattern<'a>>(&'a self, pat: P) -> RSplitTerminator<'a, P> - where P::Searcher: ReverseSearcher<'a> + pub fn rsplit_terminator<'a, P: Needle<&'a str>>(&'a self, pat: P) -> RSplitTerminator<'a, P> + where + P::Searcher: ReverseSearcher<str>, + P::Consumer: Consumer<str>, // FIXME: RFC 2089 { - RSplitTerminator(self.split_terminator(pat).0) + ext::rsplit_terminator(self, pat) } /// An iterator over substrings of the given string slice, separated by a @@ -3266,11 +2968,12 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn splitn<'a, P: Pattern<'a>>(&'a self, n: usize, pat: P) -> SplitN<'a, P> { - SplitN(SplitNInternal { - iter: self.split(pat).0, - count: n, - }) + pub fn splitn<'a, P: Needle<&'a str>>(&'a self, n: usize, pat: P) -> SplitN<'a, P> + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::splitn(self, n, pat) } /// An iterator over substrings of this string slice, separated by a @@ -3315,10 +3018,107 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn rsplitn<'a, P: Pattern<'a>>(&'a self, n: usize, pat: P) -> RSplitN<'a, P> - where P::Searcher: ReverseSearcher<'a> + pub fn rsplitn<'a, P: Needle<&'a str>>(&'a self, n: usize, pat: P) -> RSplitN<'a, P> + where + P::Searcher: ReverseSearcher<str>, + P::Consumer: Consumer<str>, // FIXME: RFC 2089 { - RSplitN(self.splitn(n, pat).0) + ext::rsplitn(self, n, pat) + } + + // FIXME: Someone should enhance the docs before stabilizing. + + /// An iterator over substrings of this mutable string slice, separated by + /// characters matched by a pattern. 
+ #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn split_mut<'a, P>(&'a mut self, pat: P) -> ext::Split<&'a mut str, P::Searcher> + where + P: Needle<&'a mut str>, + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::split(self, pat) + } + + /// An iterator over substrings of the given mutable string slice, separated by + /// characters matched by a pattern and yielded in reverse order. + #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn rsplit_mut<'a, P>(&'a mut self, pat: P) -> ext::RSplit<&'a mut str, P::Searcher> + where + P: Needle<&'a mut str>, + P::Searcher: ReverseSearcher<str>, + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::rsplit(self, pat) + } + + /// An iterator over substrings of the given mutable string slice, separated by + /// characters matched by a pattern. + /// + /// Equivalent to [`split_mut`], except that the trailing substring + /// is skipped if empty. + /// + /// [`split_mut`]: #method.split_mut + #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn split_terminator_mut<'a, P>(&'a mut self, pat: P) + -> ext::SplitTerminator<&'a mut str, P::Searcher> + where + P: Needle<&'a mut str>, + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::split_terminator(self, pat) + } + + /// An iterator over substrings of the given mutable string slice, separated by + /// characters matched by a pattern and yielded in reverse order. + /// + /// Equivalent to [`rsplit_mut`], except that the trailing substring + /// is skipped if empty. 
+ /// + /// [`rsplit_mut`]: #method.rsplit_mut + #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn rsplit_terminator_mut<'a, P>(&'a mut self, pat: P) + -> ext::RSplitTerminator<&'a mut str, P::Searcher> + where + P: Needle<&'a mut str>, + P::Searcher: ReverseSearcher<str>, + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::rsplit_terminator(self, pat) + } + + /// An iterator over substrings of the given mutable string slice, separated by a + /// pattern, restricted to returning at most `n` items. + #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn splitn_mut<'a, P>(&'a mut self, n: usize, pat: P) + -> ext::SplitN<&'a mut str, P::Searcher> + where + P: Needle<&'a mut str>, + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::splitn(self, n, pat) + } + + /// An iterator over substrings of this mutable string slice, separated by a + /// pattern, starting from the end of the string, restricted to returning + /// at most `n` items. 
+ #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn rsplitn_mut<'a, P>(&'a mut self, n: usize, pat: P) + -> ext::RSplitN<&'a mut str, P::Searcher> + where + P: Needle<&'a mut str>, + P::Searcher: ReverseSearcher<str>, + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::rsplitn(self, n, pat) } /// An iterator over the disjoint matches of a pattern within the given string @@ -3353,8 +3153,12 @@ impl str { /// ``` #[stable(feature = "str_matches", since = "1.2.0")] #[inline] - pub fn matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> Matches<'a, P> { - Matches(MatchesInternal(pat.into_searcher(self))) + pub fn matches<'a, P: Needle<&'a str>>(&'a self, pat: P) -> Matches<'a, P> + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::matches(self, pat) } /// An iterator over the disjoint matches of a pattern within this string slice, @@ -3388,10 +3192,12 @@ impl str { /// ``` #[stable(feature = "str_matches", since = "1.2.0")] #[inline] - pub fn rmatches<'a, P: Pattern<'a>>(&'a self, pat: P) -> RMatches<'a, P> - where P::Searcher: ReverseSearcher<'a> + pub fn rmatches<'a, P: Needle<&'a str>>(&'a self, pat: P) -> RMatches<'a, P> + where + P::Searcher: ReverseSearcher<str>, + P::Consumer: Consumer<str>, // FIXME: RFC 2089 { - RMatches(self.matches(pat).0) + ext::rmatches(self, pat) } /// An iterator over the disjoint matches of a pattern within this string @@ -3432,8 +3238,12 @@ impl str { /// ``` #[stable(feature = "str_match_indices", since = "1.5.0")] #[inline] - pub fn match_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> MatchIndices<'a, P> { - MatchIndices(MatchIndicesInternal(pat.into_searcher(self))) + pub fn match_indices<'a, P: Needle<&'a str>>(&'a self, pat: P) -> MatchIndices<'a, P> + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::match_indices(self, pat) } /// An iterator over the disjoint 
matches of a pattern within `self`, @@ -3473,10 +3283,191 @@ impl str { /// ``` #[stable(feature = "str_match_indices", since = "1.5.0")] #[inline] - pub fn rmatch_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> RMatchIndices<'a, P> - where P::Searcher: ReverseSearcher<'a> + pub fn rmatch_indices<'a, P: Needle<&'a str>>(&'a self, pat: P) -> RMatchIndices<'a, P> + where + P::Searcher: ReverseSearcher<str>, + P::Consumer: Consumer<str>, // FIXME: RFC 2089 { - RMatchIndices(self.match_indices(pat).0) + ext::rmatch_indices(self, pat) + } + + /// An iterator over the disjoint matches of a pattern within this string + /// slice as well as the range that the match covers. + /// + /// For matches of `pat` within `self` that overlap, only the ranges + /// corresponding to the first match are returned. + /// + /// The pattern can be a `&str`, [`char`], or a closure that determines + /// if a substring matches. + /// + /// # Iterator behavior + /// + /// The returned iterator will be a [`DoubleEndedIterator`] if the pattern + /// allows a reverse search and forward/reverse search yields the same + /// elements. This is true for, eg, [`char`] but not for `&str`. + /// + /// [`DoubleEndedIterator`]: iter/trait.DoubleEndedIterator.html + /// + /// If the pattern allows a reverse search but its results might differ + /// from a forward search, the [`rmatch_ranges`] method can be used. 
+ /// + /// [`rmatch_ranges`]: #method.rmatch_ranges + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(str_find_range)] + /// + /// let v: Vec<_> = "abcXXXabcYYYabc".match_ranges("abc").collect(); + /// assert_eq!(v, [(0..3, "abc"), (6..9, "abc"), (12..15, "abc")]); + /// + /// let v: Vec<_> = "1abcabc2".match_ranges("abc").collect(); + /// assert_eq!(v, [(1..4, "abc"), (4..7, "abc")]); + /// + /// let v: Vec<_> = "ababa".match_ranges("aba").collect(); + /// assert_eq!(v, [(0..3, "aba")]); // only the first `aba` + /// ``` + #[unstable(feature = "str_find_range", issue = "56345")] + #[inline] + pub fn match_ranges<'a, P>(&'a self, pat: P) -> ext::MatchRanges<&'a str, P::Searcher> + where + P: Needle<&'a str>, + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::match_ranges(self, pat) + } + + /// An iterator over the disjoint matches of a pattern within `self`, + /// yielded in reverse order along with the range of the match. + /// + /// For matches of `pat` within `self` that overlap, only the ranges + /// corresponding to the last match are returned. + /// + /// The pattern can be a `&str`, [`char`], or a closure that determines if a + /// substring matches. + /// + /// # Iterator behavior + /// + /// The returned iterator requires that the pattern supports a reverse + /// search, and it will be a [`DoubleEndedIterator`] if a forward/reverse + /// search yields the same elements. + /// + /// [`DoubleEndedIterator`]: iter/trait.DoubleEndedIterator.html + /// + /// For iterating from the front, the [`match_ranges`] method can be used. 
+ /// + /// [`match_ranges`]: #method.match_ranges + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(str_find_range)] + /// + /// let v: Vec<_> = "abcXXXabcYYYabc".rmatch_ranges("abc").collect(); + /// assert_eq!(v, [(12..15, "abc"), (6..9, "abc"), (0..3, "abc")]); + /// + /// let v: Vec<_> = "1abcabc2".rmatch_ranges("abc").collect(); + /// assert_eq!(v, [(4..7, "abc"), (1..4, "abc")]); + /// + /// let v: Vec<_> = "ababa".rmatch_ranges("aba").collect(); + /// assert_eq!(v, [(2..5, "aba")]); // only the last `aba` + /// ``` + #[unstable(feature = "str_find_range", issue = "56345")] + #[inline] + pub fn rmatch_ranges<'a, P>(&'a self, pat: P) -> ext::RMatchRanges<&'a str, P::Searcher> + where + P: Needle<&'a str>, + P::Searcher: ReverseSearcher<str>, + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::rmatch_ranges(self, pat) + } + + /// An iterator over the disjoint matches of a pattern within the given + /// mutable string slice. + #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn matches_mut<'a, P>(&'a mut self, pat: P) -> ext::Matches<&'a mut str, P::Searcher> + where + P: Needle<&'a mut str>, + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::matches(self, pat) + } + + /// An iterator over the disjoint matches of a pattern within this + /// mutable string slice, yielded in reverse order. + #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn rmatches_mut<'a, P>(&'a mut self, pat: P) -> ext::RMatches<&'a mut str, P::Searcher> + where + P: Needle<&'a mut str>, + P::Searcher: ReverseSearcher<str>, + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::rmatches(self, pat) + } + + /// An iterator over the disjoint matches of a pattern within this mutable string + /// slice as well as the index that the match starts at. 
+ #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn match_indices_mut<'a, P>(&'a mut self, pat: P) + -> ext::MatchIndices<&'a mut str, P::Searcher> + where + P: Needle<&'a mut str>, + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::match_indices(self, pat) + } + + /// An iterator over the disjoint matches of a pattern within this mutable string slice, + /// yielded in reverse order along with the index of the match. + #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn rmatch_indices_mut<'a, P>(&'a mut self, pat: P) + -> ext::RMatchIndices<&'a mut str, P::Searcher> + where + P: Needle<&'a mut str>, + P::Searcher: ReverseSearcher<str>, + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::rmatch_indices(self, pat) + } + + /// An iterator over the disjoint matches of a pattern within this mutable string + /// slice as well as the range that the match covers. + #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn match_ranges_mut<'a, P>(&'a mut self, pat: P) + -> ext::MatchRanges<&'a mut str, P::Searcher> + where + P: Needle<&'a mut str>, + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::match_ranges(self, pat) + } + + /// An iterator over the disjoint matches of a pattern within this mutable string slice, + /// yielded in reverse order along with the range of the match. + #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn rmatch_ranges_mut<'a, P>(&'a mut self, pat: P) + -> ext::RMatchRanges<&'a mut str, P::Searcher> + where + P: Needle<&'a mut str>, + P::Searcher: ReverseSearcher<str>, + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::rmatch_ranges(self, pat) } /// Returns a string slice with leading and trailing whitespace removed. 
@@ -3682,24 +3673,12 @@ impl str { #[must_use = "this returns the trimmed string as a new slice, \ without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] - pub fn trim_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str - where P::Searcher: DoubleEndedSearcher<'a> + pub fn trim_matches<'a, P: Needle<&'a str>>(&'a self, pat: P) -> &'a str + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: DoubleEndedConsumer<str>, { - let mut i = 0; - let mut j = 0; - let mut matcher = pat.into_searcher(self); - if let Some((a, b)) = matcher.next_reject() { - i = a; - j = b; // Remember earliest known match, correct it below if - // last match is different - } - if let Some((_, b)) = matcher.next_reject_back() { - j = b; - } - unsafe { - // Searcher is known to return valid indices - self.get_unchecked(i..j) - } + ext::trim(self, pat) } /// Returns a string slice with all prefixes that match a pattern @@ -3729,16 +3708,12 @@ impl str { #[must_use = "this returns the trimmed string as a new slice, \ without modifying the original"] #[stable(feature = "trim_direction", since = "1.30.0")] - pub fn trim_start_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { - let mut i = self.len(); - let mut matcher = pat.into_searcher(self); - if let Some((a, _)) = matcher.next_reject() { - i = a; - } - unsafe { - // Searcher is known to return valid indices - self.get_unchecked(i..self.len()) - } + pub fn trim_start_matches<'a, P: Needle<&'a str>>(&'a self, pat: P) -> &'a str + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::trim_start(self, pat) } /// Returns a string slice with all suffixes that match a pattern @@ -3774,18 +3749,12 @@ impl str { #[must_use = "this returns the trimmed string as a new slice, \ without modifying the original"] #[stable(feature = "trim_direction", since = "1.30.0")] - pub fn trim_end_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str - 
where P::Searcher: ReverseSearcher<'a> + pub fn trim_end_matches<'a, P: Needle<&'a str>>(&'a self, pat: P) -> &'a str + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: ReverseConsumer<str>, { - let mut j = 0; - let mut matcher = pat.into_searcher(self); - if let Some((_, b)) = matcher.next_reject_back() { - j = b; - } - unsafe { - // Searcher is known to return valid indices - self.get_unchecked(0..j) - } + ext::trim_end(self, pat) } /// Returns a string slice with all prefixes that match a pattern @@ -3820,7 +3789,11 @@ impl str { reason = "superseded by `trim_start_matches`", suggestion = "trim_start_matches", )] - pub fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { + pub fn trim_left_matches<'a, P: Needle<&'a str>>(&'a self, pat: P) -> &'a str + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { self.trim_start_matches(pat) } @@ -3862,12 +3835,71 @@ impl str { reason = "superseded by `trim_end_matches`", suggestion = "trim_end_matches", )] - pub fn trim_right_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str - where P::Searcher: ReverseSearcher<'a> + pub fn trim_right_matches<'a, P: Needle<&'a str>>(&'a self, pat: P) -> &'a str + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: ReverseConsumer<str>, { self.trim_end_matches(pat) } + /// Returns a mutable string slice with leading and trailing whitespace removed. + #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn trim_mut(&mut self) -> &mut str { + self.trim_matches_mut(|c: char| c.is_whitespace()) + } + + /// Returns a mutable string slice with leading whitespace removed. + #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn trim_start_mut(&mut self) -> &mut str { + self.trim_start_matches_mut(|c: char| c.is_whitespace()) + } + + /// Returns a mutable string slice with trailing whitespace removed. 
+ #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn trim_end_mut(&mut self) -> &mut str { + self.trim_end_matches_mut(|c: char| c.is_whitespace()) + } + + /// Returns a mutable string slice with all prefixes and suffixes that match a + /// pattern repeatedly removed. + #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn trim_matches_mut<'a, P: Needle<&'a mut str>>(&'a mut self, pat: P) -> &'a mut str + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: DoubleEndedConsumer<str>, + { + ext::trim(self, pat) + } + + /// Returns a mutable string slice with all prefixes that match a pattern + /// repeatedly removed. + #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn trim_start_matches_mut<'a, P: Needle<&'a mut str>>(&'a mut self, pat: P) -> &'a mut str + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: Consumer<str>, // FIXME: RFC 2089 + { + ext::trim_start(self, pat) + } + + /// Returns a mutable string slice with all suffixes that match a pattern + /// repeatedly removed. + #[unstable(feature = "mut_str_needle_methods", issue = "56345")] + #[inline] + pub fn trim_end_matches_mut<'a, P: Needle<&'a mut str>>(&'a mut self, pat: P) -> &'a mut str + where + P::Searcher: Searcher<str>, // FIXME: RFC 2089 + P::Consumer: ReverseConsumer<str>, + { + ext::trim_end(self, pat) + } + /// Parses this string slice into another type. 
/// /// Because `parse` is so general, it can cause problems with type
diff --git a/src/libcore/str/needles.rs b/src/libcore/str/needles.rs
new file mode 100644
index 0000000000000..53d55da9fbdf6
--- /dev/null
+++ b/src/libcore/str/needles.rs
@@ -0,0 +1,365 @@
+use crate::needle::*;
+use crate::ops::Range;
+use crate::slice::needles::{SliceSearcher, NaiveSearcher, TwoWaySearcher};
+use crate::slice::memchr::{memchr, memrchr};
+use crate::fmt;
+
+//------------------------------------------------------------------------------
+// Character function searcher
+//------------------------------------------------------------------------------
+
+/// A `char` predicate backed by a slice of characters.
+///
+/// Calling it with a `char` returns `true` when that character is equal to any
+/// element of the wrapped `&'p [char]`. It implements the `Fn*` traits by hand
+/// so that it can be used wherever an `FnMut(char) -> bool` is expected.
+#[derive(Copy, Clone, Debug)]
+pub struct MultiCharEq<'p>(&'p [char]);
+
+// `call_once` and `call_mut` both forward to `Fn::call`; the predicate holds
+// only a shared slice, so the three calling modes behave identically.
+impl<'p> FnOnce<(char,)> for MultiCharEq<'p> {
+    type Output = bool;
+    #[inline]
+    extern "rust-call" fn call_once(self, args: (char,)) -> bool {
+        self.call(args)
+    }
+}
+
+impl<'p> FnMut<(char,)> for MultiCharEq<'p> {
+    #[inline]
+    extern "rust-call" fn call_mut(&mut self, args: (char,)) -> bool {
+        self.call(args)
+    }
+}
+
+impl<'p> Fn<(char,)> for MultiCharEq<'p> {
+    /// Returns `true` if `c` equals any character in the wrapped slice
+    /// (linear scan; an empty slice matches nothing).
+    #[inline]
+    extern "rust-call" fn call(&self, (c,): (char,)) -> bool {
+        self.0.iter().any(|ch| *ch == c)
+    }
+}
+
+/// Searcher/consumer state for needles that test one `char` at a time.
+///
+/// The only state is the predicate itself; `F` is left unbounded on the struct
+/// and constrained on the individual impls.
+#[derive(Clone)]
+pub struct MultiCharSearcher<F> {
+    // Called with each candidate `char`; `true` means "this char matches".
+    predicate: F,
+}
+
+// we need to impl Debug for everything due to stability guarantee.
+// `F` is not required to be `Debug` (it is usually a closure), so only the
+// struct name is printed and the predicate field is omitted.
+impl<F> fmt::Debug for MultiCharSearcher<F> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("MultiCharSearcher").finish()
+    }
+}
+
+unsafe impl<F: FnMut(char) -> bool> Searcher<str> for MultiCharSearcher<F> {
+    /// Finds the first `char` inside the span's range that satisfies the
+    /// predicate and returns its byte range, expressed relative to the whole
+    /// haystack (not to the sub-range). Returns `None` if no char matches.
+    #[inline]
+    fn search(&mut self, span: Span<&str>) -> Option<Range<usize>> {
+        let (hay, range) = span.into_parts();
+        let st = range.start;
+        let h = &hay[range];
+        let mut chars = h.chars();
+        // `find` leaves the iterator positioned just past the matching char.
+        let c = chars.find(|c| (self.predicate)(*c))?;
+        let end = chars.as_str().as_ptr();
+        // SAFETY: `chars.as_str()` is the not-yet-iterated suffix of `h`, so
+        // both pointers point into (or one past the end of) the same `str`.
+        // The difference is the end of the match relative to `h`; adding `st`
+        // rebases it onto `hay`.
+        let end = unsafe { end.offset_from(h.as_ptr()) as usize } + st;
+        Some((end - c.len_utf8())..end)
+    }
+}
+
+unsafe impl<F: FnMut(char) -> bool> Consumer<str> for MultiCharSearcher<F> {
+    /// Tests only the `char` that starts at `range.start`. Returns the index
+    /// just past it when the predicate accepts it, `None` when it is rejected
+    /// or the range is empty.
+    #[inline]
+    fn consume(&mut self, hay: Span<&str>) -> Option<usize> {
+        let (hay, range) = hay.into_parts();
+        let start = range.start;
+        if start == range.end {
+            return None;
+        }
+        // SAFETY (assumed, TODO confirm against `Span`'s invariants): `start`
+        // must be an in-bounds char boundary of `hay`. The `unwrap` relies on
+        // the non-empty-range check above making `hay[start..]` non-empty.
+        let c = unsafe { hay.get_unchecked(start..) }.chars().next().unwrap();
+        if (self.predicate)(c) {
+            Some(start + c.len_utf8())
+        } else {
+            None
+        }
+    }
+
+    /// Returns the byte index of the first `char` that does NOT satisfy the
+    /// predicate, or `hay.len()` if every char satisfies it.
+    #[inline]
+    fn trim_start(&mut self, hay: &str) -> usize {
+        let mut chars = hay.chars();
+        // `find_map` stops on the first rejected char, but by then the iterator
+        // has already stepped over it; remember its width so it can be handed
+        // back below. If nothing is rejected there is nothing to hand back.
+        let unconsume_amount = chars
+            .find_map(|c| if !(self.predicate)(c) { Some(c.len_utf8()) } else { None })
+            .unwrap_or(0);
+        // SAFETY: `chars.as_str()` is a suffix of `hay`, so both pointers are
+        // derived from the same `str`.
+        let consumed = unsafe { chars.as_str().as_ptr().offset_from(hay.as_ptr()) as usize };
+        // `consumed` always includes the rejected char when there is one, so
+        // this subtraction cannot actually wrap.
+        consumed.wrapping_sub(unconsume_amount)
+    }
+}
+
+unsafe impl<F: FnMut(char) -> bool> ReverseSearcher<str> for MultiCharSearcher<F> {
+    /// Finds the last `char` inside the span's range that satisfies the
+    /// predicate and returns its byte range relative to the whole haystack.
+    #[inline]
+    fn rsearch(&mut self, span: Span<&str>) -> Option<Range<usize>> {
+        let (hay, range) = span.into_parts();
+        let st = range.start;
+        let h = &hay[range];
+        let mut chars = h.chars();
+        let c = chars.rfind(|c| (self.predicate)(*c))?;
+        // `rfind` consumed the match and everything after it from the back, so
+        // what remains is exactly the text before the match; its length is the
+        // match's start offset within `h`.
+        let start = chars.as_str().len() + st;
+        Some(start..(start + c.len_utf8()))
+    }
+}
+
+unsafe impl<F: FnMut(char) -> bool> ReverseConsumer<str> for MultiCharSearcher<F> {
+    /// Tests only the `char` that ends at `range.end`. Returns the index where
+    /// that char starts when the predicate accepts it, `None` when it is
+    /// rejected or the range is empty.
+    #[inline]
+    fn rconsume(&mut self, hay: Span<&str>) -> Option<usize> {
+        let (hay, range) = hay.into_parts();
+        let end = range.end;
+        if range.start == end {
+            return None;
+        }
+        // SAFETY (assumed, TODO confirm against `Span`'s invariants): `end`
+        // must be an in-bounds char boundary of `hay`. The `unwrap` relies on
+        // the non-empty-range check above making `hay[..end]` non-empty.
+        let c = unsafe { hay.get_unchecked(..end) }.chars().next_back().unwrap();
+        if (self.predicate)(c) {
+            Some(end - c.len_utf8())
+        } else {
+            None
+        }
+    }
+
+    /// Returns the byte index just past the last `char` that does NOT satisfy
+    /// the predicate, or `0` if every char satisfies it.
+    #[inline]
+    fn trim_end(&mut self, hay: &str) -> usize {
+        // `find.map_or` is faster in trim_end in the microbenchmark, while
+        // `find.unwrap_or` is faster in trim_start. Don't ask me why.
+        let mut chars = hay.chars();
+        // `by_ref()` keeps `chars` usable afterwards; the reversed `find` stops
+        // on the first rejected char from the back, having already removed it
+        // from the iterator, so its width is added back below.
+        let unconsume_amount = chars
+            .by_ref()
+            .rev() // btw, `rev().find()` is faster than `rfind()`
+            .find(|c| !(self.predicate)(*c))
+            .map_or(0, |c| c.len_utf8());
+        chars.as_str().len() + unconsume_amount
+    }
+}
+
+// Marker impls: no methods are provided here; they only declare that the
+// forward and reverse implementations above may be used from both ends.
+unsafe impl<F: FnMut(char) -> bool> DoubleEndedSearcher<str> for MultiCharSearcher<F> {}
+unsafe impl<F: FnMut(char) -> bool> DoubleEndedConsumer<str> for MultiCharSearcher<F> {}
+
+// Implements `Needle<$ty>` for (a) every `FnMut(char) -> bool` and (b) `&[char]`,
+// both backed by `MultiCharSearcher`. `$ty` is the haystack type and is expected
+// to mention the lifetime `'h`, which each generated impl declares.
+macro_rules! impl_needle_with_multi_char_searcher {
+    ($ty:ty) => {
+        impl<'h, F: FnMut(char) -> bool> Needle<$ty> for F {
+            type Searcher = MultiCharSearcher<F>;
+            type Consumer = MultiCharSearcher<F>;
+
+            #[inline]
+            fn into_searcher(self) -> Self::Searcher {
+                MultiCharSearcher { predicate: self }
+            }
+
+            #[inline]
+            fn into_consumer(self) -> Self::Consumer {
+                MultiCharSearcher { predicate: self }
+            }
+        }
+
+        // A char slice is wrapped in `MultiCharEq`, which matches any char
+        // contained in the slice.
+        impl<'h, 'p> Needle<$ty> for &'p [char] {
+            type Searcher = MultiCharSearcher<MultiCharEq<'p>>;
+            type Consumer = MultiCharSearcher<MultiCharEq<'p>>;
+
+            #[inline]
+            fn into_searcher(self) -> Self::Searcher {
+                MultiCharSearcher { predicate: MultiCharEq(self) }
+            }
+
+            #[inline]
+            fn into_consumer(self) -> Self::Consumer {
+                MultiCharSearcher { predicate: MultiCharEq(self) }
+            }
+        }
+    }
+}
+
+// Instantiate for both shared and mutable string haystacks.
+impl_needle_with_multi_char_searcher!(&'h str);
+impl_needle_with_multi_char_searcher!(&'h mut str);
+
+//------------------------------------------------------------------------------
+// Character searcher
+//------------------------------------------------------------------------------
+
+/// Searcher and consumer for a single `char` needle.
+///
+/// The character is UTF-8 encoded once in `new`, so that searching can work on
+/// raw bytes with `memchr`/`memrchr` instead of decoding the haystack.
+#[derive(Debug, Clone)]
+pub struct CharSearcher {
+    // safety invariant: `utf8_size` must be less than 5
+    // (`new` takes it from `encode_utf8` into a 4-byte buffer, so it is 1..=4)
+    utf8_size: usize,
+
+    /// A utf8 encoded copy of the `needle`
+    utf8_encoded: [u8; 4],
+
+    /// The character currently being searched.
+    c: char,
+}
+
+impl CharSearcher {
+    /// The UTF-8 encoding of the needle, without the unused tail of the buffer.
+    #[inline]
+    fn as_bytes(&self) -> &[u8] {
+        &self.utf8_encoded[..self.utf8_size]
+    }
+
+    /// The final byte of the needle's UTF-8 encoding; this is the byte that
+    /// `search`/`rsearch` scan for.
+    #[inline]
+    fn last_byte(&self) -> u8 {
+        self.utf8_encoded[self.utf8_size - 1]
+    }
+
+    /// Builds a searcher for `c`, caching its UTF-8 encoding and length.
+    #[inline]
+    fn new(c: char) -> Self {
+        let mut utf8_encoded = [0u8; 4];
+        let utf8_size = c.encode_utf8(&mut utf8_encoded).len();
+        CharSearcher {
+            utf8_size,
+            utf8_encoded,
+            c,
+        }
+    }
+}
+
+unsafe impl Searcher<str> for CharSearcher {
+    /// Returns the byte range (relative to the whole haystack) of the first
+    /// occurrence of the needle within the span's range, or `None`.
+    #[inline]
+    fn search(&mut self, span: Span<&str>) -> Option<Range<usize>> {
+        let (hay, range) = span.into_parts();
+        let mut finger = range.start;
+        let bytes = hay.as_bytes();
+        loop {
+            // Scan for the needle's *last* byte; `?` ends the search once it no
+            // longer occurs in the remaining range.
+            let index = memchr(self.last_byte(), &bytes[finger..range.end])?;
+            // Move just past the byte that was found. While the loop is running
+            // `finger` may sit in the middle of some other multi-byte char; that
+            // is fine because it is only ever used as a byte index here, and the
+            // function only returns on a full match (which ends on a boundary).
+            finger += index + 1;
+            // Guard against underflow in `finger - utf8_size` when the byte was
+            // found too close to the start of the haystack to be a full match.
+            if finger >= self.utf8_size {
+                let found = &bytes[(finger - self.utf8_size)..finger];
+                if found == self.as_bytes() {
+                    return Some((finger - self.utf8_size)..finger);
+                }
+            }
+        }
+    }
+}
+
+unsafe impl Consumer<str> for CharSearcher {
+    /// Checks whether the span starts with the needle by comparing bytes,
+    /// delegating to the byte-slice consumer for the encoded needle.
+    #[inline]
+    fn consume(&mut self, span: Span<&str>) -> Option<usize> {
+        let mut consumer = Needle::<&[u8]>::into_consumer(self.as_bytes());
+        consumer.consume(span.as_bytes())
+    }
+
+    /// Delegates to the char-predicate consumer with an equality test on `c`.
+    #[inline]
+    fn trim_start(&mut self, hay: &str) -> usize {
+        let mut consumer = Needle::<&str>::into_consumer(|c: char| c == self.c);
+        consumer.trim_start(hay)
+    }
+}
+
+unsafe impl ReverseSearcher<str> for CharSearcher {
+    /// Returns the byte range (relative to the whole haystack) of the last
+    /// occurrence of the needle within the span's range, or `None`.
+    #[inline]
+    fn rsearch(&mut self, span: Span<&str>) -> Option<Range<usize>> {
+        let (hay, range) = span.into_parts();
+        let start = range.start;
+        // Unlike `search`, this works on the sub-slice, so indices below are
+        // relative to `range.start` until `start` is added back.
+        let mut bytes = hay[range].as_bytes();
+        loop {
+            // `index` is one past the last occurrence of the needle's last byte.
+            let index = memrchr(self.last_byte(), bytes)? + 1;
+            if index >= self.utf8_size {
+                let found = &bytes[(index - self.utf8_size)..index];
+                if found == self.as_bytes() {
+                    let index = index + start;
+                    return Some((index - self.utf8_size)..index);
+                }
+            }
+            // Not a match: drop the byte just examined (and everything after it)
+            // and keep scanning backwards.
+            bytes = &bytes[..(index - 1)];
+        }
+    }
+}
+
+unsafe impl ReverseConsumer<str> for CharSearcher {
+    /// Checks whether the span ends with the needle.
+    #[inline]
+    fn rconsume(&mut self, span: Span<&str>) -> Option<usize> {
+        if self.utf8_size == 1 {
+            // Single-byte encoding means `c` is ASCII, so `c as u8` is lossless
+            // and a plain byte comparison on the last byte is enough.
+            let mut consumer = Needle::<&[u8]>::into_consumer(|b: &u8| *b == self.c as u8);
+            consumer.rconsume(span.as_bytes())
+        } else {
+            // Multi-byte needle: decode the last char and compare it.
+            let mut consumer = Needle::<&str>::into_consumer(|c: char| c == self.c);
+            consumer.rconsume(span)
+        }
+    }
+
+    /// Delegates to the char-predicate consumer with an equality test on `c`.
+    #[inline]
+    fn trim_end(&mut self, haystack: &str) -> usize {
+        let mut consumer = Needle::<&str>::into_consumer(|c: char| c == self.c);
+        consumer.trim_end(haystack)
+    }
+}
+
+// Marker impls: no methods are provided here.
+unsafe impl DoubleEndedSearcher<str> for CharSearcher {}
+unsafe impl DoubleEndedConsumer<str> for CharSearcher {}
+
+// A `char` can be searched for in any haystack whose target is `str`; both the
+// searcher and the consumer are a freshly built `CharSearcher`.
+impl<H: Haystack<Target = str>> Needle<H> for char {
+    type Searcher = CharSearcher;
+    type Consumer = CharSearcher;
+
+    #[inline]
+    fn into_searcher(self) -> Self::Searcher {
+        CharSearcher::new(self)
+    }
+
+    #[inline]
+    fn into_consumer(self) -> Self::Consumer {
+        CharSearcher::new(self)
+    }
+}
+
+//------------------------------------------------------------------------------
+// String searcher
+//------------------------------------------------------------------------------
+
+// The byte-oriented searchers from `slice::needles` are reused for `str` by
+// viewing the haystack as bytes.
+// NOTE(review): returning their byte ranges as `str` ranges presumably relies on
+// a match of a valid UTF-8 needle in a valid UTF-8 haystack always falling on
+// char boundaries — confirm against the `TwoWaySearcher` implementation.
+unsafe impl<'p> Searcher<str> for TwoWaySearcher<'p, u8> {
+    #[inline]
+    fn search(&mut self, span: Span<&str>) -> Option<Range<usize>> {
+        let (hay, range) = span.into_parts();
+        self.next(hay.as_bytes(), range)
+    }
+}
+
+unsafe impl<'p> ReverseSearcher<str> for TwoWaySearcher<'p, u8> {
+    #[inline]
+    fn rsearch(&mut self, span: Span<&str>) -> Option<Range<usize>> {
+        let (hay, range) = span.into_parts();
+        self.next_back(hay.as_bytes(), range)
+    }
+}
+
+unsafe impl<'p> Consumer<str> for NaiveSearcher<'p, u8> {
+    // NOTE(review): this is meant to dispatch to the byte-slice `consume`, not to
+    // recurse into this method; that depends on `span.as_bytes()` yielding a byte
+    // span — confirm against `Span`.
+    #[inline]
+    fn consume(&mut self, span: Span<&str>) -> Option<usize> {
+        self.consume(span.as_bytes())
+    }
+
+    // The argument is a `&[u8]`, so this calls the byte-slice `trim_start`
+    // rather than this `&str` method.
+    #[inline]
+    fn trim_start(&mut self, hay: &str) -> usize {
+        self.trim_start(hay.as_bytes())
+    }
+}
+
+unsafe impl<'p> ReverseConsumer<str> for NaiveSearcher<'p, u8> {
+    // NOTE(review): same dispatch assumption as `consume` above — confirm.
+    #[inline]
+    fn rconsume(&mut self, span: Span<&str>) -> Option<usize> {
+        self.rconsume(span.as_bytes())
+    }
+
+    // The argument is a `&[u8]`, so this calls the byte-slice `trim_end`.
+    #[inline]
+    fn trim_end(&mut self, hay: &str) -> usize {
+        self.trim_end(hay.as_bytes())
+    }
+}
+
+// Implements `Needle<H>` (for every haystack `H` targeting `str`) for a
+// string-like pattern type `$pat`. The generics passed in `<[...]>` must include
+// a lifetime named `'p`, because the associated types below refer to it.
+// NOTE(review): the searcher type is `SliceSearcher`, while the `Searcher<str>`
+// impl visible in this file is for `TwoWaySearcher`; presumably `SliceSearcher`
+// gets its `str` impl elsewhere — confirm.
+macro_rules! impl_needle_for_str_searcher {
+    (<[$($gen:tt)*]> for $pat:ty) => {
+        impl<$($gen)*, H: Haystack<Target = str>> Needle<H> for $pat {
+            type Searcher = SliceSearcher<'p, u8>;
+            type Consumer = NaiveSearcher<'p, u8>;
+
+            #[inline]
+            fn into_searcher(self) -> Self::Searcher {
+                SliceSearcher::new(self.as_bytes())
+            }
+
+            #[inline]
+            fn into_consumer(self) -> Self::Consumer {
+                NaiveSearcher::new(self.as_bytes())
+            }
+        }
+    }
+}
+
+// `&str` and `&&str` patterns.
+impl_needle_for_str_searcher!(<['p]> for &'p str);
+impl_needle_for_str_searcher!(<['q, 'p]> for &'q &'p str);
diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs
deleted file mode 100644
index ad9d956fda1c8..0000000000000
--- a/src/libcore/str/pattern.rs
+++ /dev/null
@@ -1,1402 +0,0 @@
-//! The string Pattern API.
-//!
-//! For more details, see the traits [`Pattern`], [`Searcher`],
-//! [`ReverseSearcher`], and [`DoubleEndedSearcher`].
-
-#![unstable(feature = "pattern",
-            reason = "API not fully fleshed out and ready to be stabilized",
-            issue = "27721")]
-
-use crate::cmp;
-use crate::fmt;
-use crate::slice::memchr;
-use crate::usize;
-
-// Pattern
-
-/// A string pattern.
-///
-/// A `Pattern<'a>` expresses that the implementing type
-/// can be used as a string pattern for searching in a `&'a str`.
-///
-/// For example, both `'a'` and `"aa"` are patterns that
-/// would match at index `1` in the string `"baaaab"`.
-/// -/// The trait itself acts as a builder for an associated -/// `Searcher` type, which does the actual work of finding -/// occurrences of the pattern in a string. -pub trait Pattern<'a>: Sized { - /// Associated searcher for this pattern - type Searcher: Searcher<'a>; - - /// Constructs the associated searcher from - /// `self` and the `haystack` to search in. - fn into_searcher(self, haystack: &'a str) -> Self::Searcher; - - /// Checks whether the pattern matches anywhere in the haystack - #[inline] - fn is_contained_in(self, haystack: &'a str) -> bool { - self.into_searcher(haystack).next_match().is_some() - } - - /// Checks whether the pattern matches at the front of the haystack - #[inline] - fn is_prefix_of(self, haystack: &'a str) -> bool { - match self.into_searcher(haystack).next() { - SearchStep::Match(0, _) => true, - _ => false, - } - } - - /// Checks whether the pattern matches at the back of the haystack - #[inline] - fn is_suffix_of(self, haystack: &'a str) -> bool - where Self::Searcher: ReverseSearcher<'a> - { - match self.into_searcher(haystack).next_back() { - SearchStep::Match(_, j) if haystack.len() == j => true, - _ => false, - } - } -} - -// Searcher - -/// Result of calling `Searcher::next()` or `ReverseSearcher::next_back()`. -#[derive(Copy, Clone, Eq, PartialEq, Debug)] -pub enum SearchStep { - /// Expresses that a match of the pattern has been found at - /// `haystack[a..b]`. - Match(usize, usize), - /// Expresses that `haystack[a..b]` has been rejected as a possible match - /// of the pattern. - /// - /// Note that there might be more than one `Reject` between two `Match`es, - /// there is no requirement for them to be combined into one. - Reject(usize, usize), - /// Expresses that every byte of the haystack has been visited, ending - /// the iteration. - Done -} - -/// A searcher for a string pattern. 
-/// -/// This trait provides methods for searching for non-overlapping -/// matches of a pattern starting from the front (left) of a string. -/// -/// It will be implemented by associated `Searcher` -/// types of the `Pattern` trait. -/// -/// The trait is marked unsafe because the indices returned by the -/// `next()` methods are required to lie on valid utf8 boundaries in -/// the haystack. This enables consumers of this trait to -/// slice the haystack without additional runtime checks. -pub unsafe trait Searcher<'a> { - /// Getter for the underlying string to be searched in - /// - /// Will always return the same `&str` - fn haystack(&self) -> &'a str; - - /// Performs the next search step starting from the front. - /// - /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern. - /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the - /// pattern, even partially. - /// - Returns `Done` if every byte of the haystack has been visited - /// - /// The stream of `Match` and `Reject` values up to a `Done` - /// will contain index ranges that are adjacent, non-overlapping, - /// covering the whole haystack, and laying on utf8 boundaries. - /// - /// A `Match` result needs to contain the whole matched pattern, - /// however `Reject` results may be split up into arbitrary - /// many adjacent fragments. Both ranges may have zero length. - /// - /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` - /// might produce the stream - /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]` - fn next(&mut self) -> SearchStep; - - /// Finds the next `Match` result. See `next()` - /// - /// Unlike next(), there is no guarantee that the returned ranges - /// of this and next_reject will overlap. This will return (start_match, end_match), - /// where start_match is the index of where the match begins, and end_match is - /// the index after the end of the match. 
- #[inline] - fn next_match(&mut self) -> Option<(usize, usize)> { - loop { - match self.next() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } - - /// Finds the next `Reject` result. See `next()` and `next_match()` - /// - /// Unlike next(), there is no guarantee that the returned ranges - /// of this and next_match will overlap. - #[inline] - fn next_reject(&mut self) -> Option<(usize, usize)> { - loop { - match self.next() { - SearchStep::Reject(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } -} - -/// A reverse searcher for a string pattern. -/// -/// This trait provides methods for searching for non-overlapping -/// matches of a pattern starting from the back (right) of a string. -/// -/// It will be implemented by associated `Searcher` -/// types of the `Pattern` trait if the pattern supports searching -/// for it from the back. -/// -/// The index ranges returned by this trait are not required -/// to exactly match those of the forward search in reverse. -/// -/// For the reason why this trait is marked unsafe, see them -/// parent trait `Searcher`. -pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { - /// Performs the next search step starting from the back. - /// - /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern. - /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the - /// pattern, even partially. - /// - Returns `Done` if every byte of the haystack has been visited - /// - /// The stream of `Match` and `Reject` values up to a `Done` - /// will contain index ranges that are adjacent, non-overlapping, - /// covering the whole haystack, and laying on utf8 boundaries. - /// - /// A `Match` result needs to contain the whole matched pattern, - /// however `Reject` results may be split up into arbitrary - /// many adjacent fragments. Both ranges may have zero length. 
- /// - /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` - /// might produce the stream - /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]` - fn next_back(&mut self) -> SearchStep; - - /// Finds the next `Match` result. See `next_back()` - #[inline] - fn next_match_back(&mut self) -> Option<(usize, usize)>{ - loop { - match self.next_back() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } - - /// Finds the next `Reject` result. See `next_back()` - #[inline] - fn next_reject_back(&mut self) -> Option<(usize, usize)>{ - loop { - match self.next_back() { - SearchStep::Reject(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } -} - -/// A marker trait to express that a `ReverseSearcher` -/// can be used for a `DoubleEndedIterator` implementation. -/// -/// For this, the impl of `Searcher` and `ReverseSearcher` need -/// to follow these conditions: -/// -/// - All results of `next()` need to be identical -/// to the results of `next_back()` in reverse order. -/// - `next()` and `next_back()` need to behave as -/// the two ends of a range of values, that is they -/// can not "walk past each other". -/// -/// # Examples -/// -/// `char::Searcher` is a `DoubleEndedSearcher` because searching for a -/// `char` only requires looking at one at a time, which behaves the same -/// from both ends. -/// -/// `(&str)::Searcher` is not a `DoubleEndedSearcher` because -/// the pattern `"aa"` in the haystack `"aaa"` matches as either -/// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. -pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} - - -///////////////////////////////////////////////////////////////////////////// -// Impl for char -///////////////////////////////////////////////////////////////////////////// - -/// Associated type for `<char as Pattern<'a>>::Searcher`. 
-#[derive(Clone, Debug)] -pub struct CharSearcher<'a> { - haystack: &'a str, - // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack` - // This invariant can be broken *within* next_match and next_match_back, however - // they must exit with fingers on valid code point boundaries. - - /// `finger` is the current byte index of the forward search. - /// Imagine that it exists before the byte at its index, i.e. - /// `haystack[finger]` is the first byte of the slice we must inspect during - /// forward searching - finger: usize, - /// `finger_back` is the current byte index of the reverse search. - /// Imagine that it exists after the byte at its index, i.e. - /// haystack[finger_back - 1] is the last byte of the slice we must inspect during - /// forward searching (and thus the first byte to be inspected when calling next_back()) - finger_back: usize, - /// The character being searched for - needle: char, - - // safety invariant: `utf8_size` must be less than 5 - /// The number of bytes `needle` takes up when encoded in utf8 - utf8_size: usize, - /// A utf8 encoded copy of the `needle` - utf8_encoded: [u8; 4], -} - -unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { - #[inline] - fn haystack(&self) -> &'a str { - self.haystack - } - #[inline] - fn next(&mut self) -> SearchStep { - let old_finger = self.finger; - let slice = unsafe { self.haystack.get_unchecked(old_finger..self.finger_back) }; - let mut iter = slice.chars(); - let old_len = iter.iter.len(); - if let Some(ch) = iter.next() { - // add byte offset of current character - // without re-encoding as utf-8 - self.finger += old_len - iter.iter.len(); - if ch == self.needle { - SearchStep::Match(old_finger, self.finger) - } else { - SearchStep::Reject(old_finger, self.finger) - } - } else { - SearchStep::Done - } - } - #[inline] - fn next_match(&mut self) -> Option<(usize, usize)> { - loop { - // get the haystack after the last character found - let bytes = if let Some(slice) 
= self.haystack.as_bytes() - .get(self.finger..self.finger_back) { - slice - } else { - return None; - }; - // the last byte of the utf8 encoded needle - let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; - if let Some(index) = memchr::memchr(last_byte, bytes) { - // The new finger is the index of the byte we found, - // plus one, since we memchr'd for the last byte of the character. - // - // Note that this doesn't always give us a finger on a UTF8 boundary. - // If we *didn't* find our character - // we may have indexed to the non-last byte of a 3-byte or 4-byte character. - // We can't just skip to the next valid starting byte because a character like - // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find - // the second byte when searching for the third. - // - // However, this is totally okay. While we have the invariant that - // self.finger is on a UTF8 boundary, this invariant is not relied upon - // within this method (it is relied upon in CharSearcher::next()). - // - // We only exit this method when we reach the end of the string, or if we - // find something. When we find something the `finger` will be set - // to a UTF8 boundary. 
- self.finger += index + 1; - if self.finger >= self.utf8_size { - let found_char = self.finger - self.utf8_size; - if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) { - if slice == &self.utf8_encoded[0..self.utf8_size] { - return Some((found_char, self.finger)); - } - } - } - } else { - // found nothing, exit - self.finger = self.finger_back; - return None; - } - } - } - - // let next_reject use the default implementation from the Searcher trait -} - -unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { - #[inline] - fn next_back(&mut self) -> SearchStep { - let old_finger = self.finger_back; - let slice = unsafe { self.haystack.get_unchecked(self.finger..old_finger) }; - let mut iter = slice.chars(); - let old_len = iter.iter.len(); - if let Some(ch) = iter.next_back() { - // subtract byte offset of current character - // without re-encoding as utf-8 - self.finger_back -= old_len - iter.iter.len(); - if ch == self.needle { - SearchStep::Match(self.finger_back, old_finger) - } else { - SearchStep::Reject(self.finger_back, old_finger) - } - } else { - SearchStep::Done - } - } - #[inline] - fn next_match_back(&mut self) -> Option<(usize, usize)> { - let haystack = self.haystack.as_bytes(); - loop { - // get the haystack up to but not including the last character searched - let bytes = if let Some(slice) = haystack.get(self.finger..self.finger_back) { - slice - } else { - return None; - }; - // the last byte of the utf8 encoded needle - let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; - if let Some(index) = memchr::memrchr(last_byte, bytes) { - // we searched a slice that was offset by self.finger, - // add self.finger to recoup the original index - let index = self.finger + index; - // memrchr will return the index of the byte we wish to - // find. In case of an ASCII character, this is indeed - // were we wish our new finger to be ("after" the found - // char in the paradigm of reverse iteration). 
For - // multibyte chars we need to skip down by the number of more - // bytes they have than ASCII - let shift = self.utf8_size - 1; - if index >= shift { - let found_char = index - shift; - if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) { - if slice == &self.utf8_encoded[0..self.utf8_size] { - // move finger to before the character found (i.e., at its start index) - self.finger_back = found_char; - return Some((self.finger_back, self.finger_back + self.utf8_size)); - } - } - } - // We can't use finger_back = index - size + 1 here. If we found the last char - // of a different-sized character (or the middle byte of a different character) - // we need to bump the finger_back down to `index`. This similarly makes - // `finger_back` have the potential to no longer be on a boundary, - // but this is OK since we only exit this function on a boundary - // or when the haystack has been searched completely. - // - // Unlike next_match this does not - // have the problem of repeated bytes in utf-8 because - // we're searching for the last byte, and we can only have - // found the last byte when searching in reverse. 
- self.finger_back = index; - } else { - self.finger_back = self.finger; - // found nothing, exit - return None; - } - } - } - - // let next_reject_back use the default implementation from the Searcher trait -} - -impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} - -/// Searches for chars that are equal to a given char -impl<'a> Pattern<'a> for char { - type Searcher = CharSearcher<'a>; - - #[inline] - fn into_searcher(self, haystack: &'a str) -> Self::Searcher { - let mut utf8_encoded = [0; 4]; - let utf8_size = self.encode_utf8(&mut utf8_encoded).len(); - CharSearcher { - haystack, - finger: 0, - finger_back: haystack.len(), - needle: self, - utf8_size, - utf8_encoded - } - } - - #[inline] - fn is_contained_in(self, haystack: &'a str) -> bool { - if (self as u32) < 128 { - haystack.as_bytes().contains(&(self as u8)) - } else { - let mut buffer = [0u8; 4]; - self.encode_utf8(&mut buffer).is_contained_in(haystack) - } - } - - #[inline] - fn is_prefix_of(self, haystack: &'a str) -> bool { - if let Some(ch) = haystack.chars().next() { - self == ch - } else { - false - } - } - - #[inline] - fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a> - { - if let Some(ch) = haystack.chars().next_back() { - self == ch - } else { - false - } - } -} - -///////////////////////////////////////////////////////////////////////////// -// Impl for a MultiCharEq wrapper -///////////////////////////////////////////////////////////////////////////// - -#[doc(hidden)] -trait MultiCharEq { - fn matches(&mut self, c: char) -> bool; -} - -impl<F> MultiCharEq for F where F: FnMut(char) -> bool { - #[inline] - fn matches(&mut self, c: char) -> bool { (*self)(c) } -} - -impl MultiCharEq for &[char] { - #[inline] - fn matches(&mut self, c: char) -> bool { - self.iter().any(|&m| { m == c }) - } -} - -struct MultiCharEqPattern<C: MultiCharEq>(C); - -#[derive(Clone, Debug)] -struct MultiCharEqSearcher<'a, C: MultiCharEq> { - char_eq: C, - haystack: &'a 
str, - char_indices: super::CharIndices<'a>, -} - -impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern<C> { - type Searcher = MultiCharEqSearcher<'a, C>; - - #[inline] - fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> { - MultiCharEqSearcher { - haystack, - char_eq: self.0, - char_indices: haystack.char_indices(), - } - } -} - -unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> { - #[inline] - fn haystack(&self) -> &'a str { - self.haystack - } - - #[inline] - fn next(&mut self) -> SearchStep { - let s = &mut self.char_indices; - // Compare lengths of the internal byte slice iterator - // to find length of current char - let pre_len = s.iter.iter.len(); - if let Some((i, c)) = s.next() { - let len = s.iter.iter.len(); - let char_len = pre_len - len; - if self.char_eq.matches(c) { - return SearchStep::Match(i, i + char_len); - } else { - return SearchStep::Reject(i, i + char_len); - } - } - SearchStep::Done - } -} - -unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> { - #[inline] - fn next_back(&mut self) -> SearchStep { - let s = &mut self.char_indices; - // Compare lengths of the internal byte slice iterator - // to find length of current char - let pre_len = s.iter.iter.len(); - if let Some((i, c)) = s.next_back() { - let len = s.iter.iter.len(); - let char_len = pre_len - len; - if self.char_eq.matches(c) { - return SearchStep::Match(i, i + char_len); - } else { - return SearchStep::Reject(i, i + char_len); - } - } - SearchStep::Done - } -} - -impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {} - -///////////////////////////////////////////////////////////////////////////// - -macro_rules! 
pattern_methods { - ($t:ty, $pmap:expr, $smap:expr) => { - type Searcher = $t; - - #[inline] - fn into_searcher(self, haystack: &'a str) -> $t { - ($smap)(($pmap)(self).into_searcher(haystack)) - } - - #[inline] - fn is_contained_in(self, haystack: &'a str) -> bool { - ($pmap)(self).is_contained_in(haystack) - } - - #[inline] - fn is_prefix_of(self, haystack: &'a str) -> bool { - ($pmap)(self).is_prefix_of(haystack) - } - - #[inline] - fn is_suffix_of(self, haystack: &'a str) -> bool - where $t: ReverseSearcher<'a> - { - ($pmap)(self).is_suffix_of(haystack) - } - } -} - -macro_rules! searcher_methods { - (forward) => { - #[inline] - fn haystack(&self) -> &'a str { - self.0.haystack() - } - #[inline] - fn next(&mut self) -> SearchStep { - self.0.next() - } - #[inline] - fn next_match(&mut self) -> Option<(usize, usize)> { - self.0.next_match() - } - #[inline] - fn next_reject(&mut self) -> Option<(usize, usize)> { - self.0.next_reject() - } - }; - (reverse) => { - #[inline] - fn next_back(&mut self) -> SearchStep { - self.0.next_back() - } - #[inline] - fn next_match_back(&mut self) -> Option<(usize, usize)> { - self.0.next_match_back() - } - #[inline] - fn next_reject_back(&mut self) -> Option<(usize, usize)> { - self.0.next_reject_back() - } - } -} - -///////////////////////////////////////////////////////////////////////////// -// Impl for &[char] -///////////////////////////////////////////////////////////////////////////// - -// Todo: Change / Remove due to ambiguity in meaning. - -/// Associated type for `<&[char] as Pattern<'a>>::Searcher`. 
-#[derive(Clone, Debug)] -pub struct CharSliceSearcher<'a, 'b>(<MultiCharEqPattern<&'b [char]> as Pattern<'a>>::Searcher); - -unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> { - searcher_methods!(forward); -} - -unsafe impl<'a, 'b> ReverseSearcher<'a> for CharSliceSearcher<'a, 'b> { - searcher_methods!(reverse); -} - -impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {} - -/// Searches for chars that are equal to any of the chars in the array -impl<'a, 'b> Pattern<'a> for &'b [char] { - pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher); -} - -///////////////////////////////////////////////////////////////////////////// -// Impl for F: FnMut(char) -> bool -///////////////////////////////////////////////////////////////////////////// - -/// Associated type for `<F as Pattern<'a>>::Searcher`. -#[derive(Clone)] -pub struct CharPredicateSearcher<'a, F>(<MultiCharEqPattern<F> as Pattern<'a>>::Searcher) - where F: FnMut(char) -> bool; - -impl<F> fmt::Debug for CharPredicateSearcher<'_, F> - where F: FnMut(char) -> bool -{ - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("CharPredicateSearcher") - .field("haystack", &self.0.haystack) - .field("char_indices", &self.0.char_indices) - .finish() - } -} -unsafe impl<'a, F> Searcher<'a> for CharPredicateSearcher<'a, F> - where F: FnMut(char) -> bool -{ - searcher_methods!(forward); -} - -unsafe impl<'a, F> ReverseSearcher<'a> for CharPredicateSearcher<'a, F> - where F: FnMut(char) -> bool -{ - searcher_methods!(reverse); -} - -impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> - where F: FnMut(char) -> bool {} - -/// Searches for chars that match the given predicate -impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool { - pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher); -} - -///////////////////////////////////////////////////////////////////////////// -// Impl for &&str 
-///////////////////////////////////////////////////////////////////////////// - -/// Delegates to the `&str` impl. -impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str { - pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s); -} - -///////////////////////////////////////////////////////////////////////////// -// Impl for &str -///////////////////////////////////////////////////////////////////////////// - -/// Non-allocating substring search. -/// -/// Will handle the pattern `""` as returning empty matches at each character -/// boundary. -impl<'a, 'b> Pattern<'a> for &'b str { - type Searcher = StrSearcher<'a, 'b>; - - #[inline] - fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> { - StrSearcher::new(haystack, self) - } - - /// Checks whether the pattern matches at the front of the haystack - #[inline] - fn is_prefix_of(self, haystack: &'a str) -> bool { - haystack.is_char_boundary(self.len()) && - self == &haystack[..self.len()] - } - - /// Checks whether the pattern matches at the back of the haystack - #[inline] - fn is_suffix_of(self, haystack: &'a str) -> bool { - self.len() <= haystack.len() && - haystack.is_char_boundary(haystack.len() - self.len()) && - self == &haystack[haystack.len() - self.len()..] - } -} - - -///////////////////////////////////////////////////////////////////////////// -// Two Way substring searcher -///////////////////////////////////////////////////////////////////////////// - -#[derive(Clone, Debug)] -/// Associated type for `<&str as Pattern<'a>>::Searcher`. 
-pub struct StrSearcher<'a, 'b> { - haystack: &'a str, - needle: &'b str, - - searcher: StrSearcherImpl, -} - -#[derive(Clone, Debug)] -enum StrSearcherImpl { - Empty(EmptyNeedle), - TwoWay(TwoWaySearcher), -} - -#[derive(Clone, Debug)] -struct EmptyNeedle { - position: usize, - end: usize, - is_match_fw: bool, - is_match_bw: bool, -} - -impl<'a, 'b> StrSearcher<'a, 'b> { - fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> { - if needle.is_empty() { - StrSearcher { - haystack, - needle, - searcher: StrSearcherImpl::Empty(EmptyNeedle { - position: 0, - end: haystack.len(), - is_match_fw: true, - is_match_bw: true, - }), - } - } else { - StrSearcher { - haystack, - needle, - searcher: StrSearcherImpl::TwoWay( - TwoWaySearcher::new(needle.as_bytes(), haystack.len()) - ), - } - } - } -} - -unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { - #[inline] - fn haystack(&self) -> &'a str { - self.haystack - } - - #[inline] - fn next(&mut self) -> SearchStep { - match self.searcher { - StrSearcherImpl::Empty(ref mut searcher) => { - // empty needle rejects every char and matches every empty string between them - let is_match = searcher.is_match_fw; - searcher.is_match_fw = !searcher.is_match_fw; - let pos = searcher.position; - match self.haystack[pos..].chars().next() { - _ if is_match => SearchStep::Match(pos, pos), - None => SearchStep::Done, - Some(ch) => { - searcher.position += ch.len_utf8(); - SearchStep::Reject(pos, searcher.position) - } - } - } - StrSearcherImpl::TwoWay(ref mut searcher) => { - // TwoWaySearcher produces valid *Match* indices that split at char boundaries - // as long as it does correct matching and that haystack and needle are - // valid UTF-8 - // *Rejects* from the algorithm can fall on any indices, but we will walk them - // manually to the next character boundary, so that they are utf-8 safe. 
- if searcher.position == self.haystack.len() { - return SearchStep::Done; - } - let is_long = searcher.memory == usize::MAX; - match searcher.next::<RejectAndMatch>(self.haystack.as_bytes(), - self.needle.as_bytes(), - is_long) - { - SearchStep::Reject(a, mut b) => { - // skip to next char boundary - while !self.haystack.is_char_boundary(b) { - b += 1; - } - searcher.position = cmp::max(b, searcher.position); - SearchStep::Reject(a, b) - } - otherwise => otherwise, - } - } - } - } - - #[inline] - fn next_match(&mut self) -> Option<(usize, usize)> { - match self.searcher { - StrSearcherImpl::Empty(..) => { - loop { - match self.next() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - SearchStep::Reject(..) => { } - } - } - } - StrSearcherImpl::TwoWay(ref mut searcher) => { - let is_long = searcher.memory == usize::MAX; - // write out `true` and `false` cases to encourage the compiler - // to specialize the two cases separately. - if is_long { - searcher.next::<MatchOnly>(self.haystack.as_bytes(), - self.needle.as_bytes(), - true) - } else { - searcher.next::<MatchOnly>(self.haystack.as_bytes(), - self.needle.as_bytes(), - false) - } - } - } - } -} - -unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { - #[inline] - fn next_back(&mut self) -> SearchStep { - match self.searcher { - StrSearcherImpl::Empty(ref mut searcher) => { - let is_match = searcher.is_match_bw; - searcher.is_match_bw = !searcher.is_match_bw; - let end = searcher.end; - match self.haystack[..end].chars().next_back() { - _ if is_match => SearchStep::Match(end, end), - None => SearchStep::Done, - Some(ch) => { - searcher.end -= ch.len_utf8(); - SearchStep::Reject(searcher.end, end) - } - } - } - StrSearcherImpl::TwoWay(ref mut searcher) => { - if searcher.end == 0 { - return SearchStep::Done; - } - let is_long = searcher.memory == usize::MAX; - match searcher.next_back::<RejectAndMatch>(self.haystack.as_bytes(), - self.needle.as_bytes(), - 
is_long) - { - SearchStep::Reject(mut a, b) => { - // skip to next char boundary - while !self.haystack.is_char_boundary(a) { - a -= 1; - } - searcher.end = cmp::min(a, searcher.end); - SearchStep::Reject(a, b) - } - otherwise => otherwise, - } - } - } - } - - #[inline] - fn next_match_back(&mut self) -> Option<(usize, usize)> { - match self.searcher { - StrSearcherImpl::Empty(..) => { - loop { - match self.next_back() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - SearchStep::Reject(..) => { } - } - } - } - StrSearcherImpl::TwoWay(ref mut searcher) => { - let is_long = searcher.memory == usize::MAX; - // write out `true` and `false`, like `next_match` - if is_long { - searcher.next_back::<MatchOnly>(self.haystack.as_bytes(), - self.needle.as_bytes(), - true) - } else { - searcher.next_back::<MatchOnly>(self.haystack.as_bytes(), - self.needle.as_bytes(), - false) - } - } - } - } -} - -/// The internal state of the two-way substring search algorithm. -#[derive(Clone, Debug)] -struct TwoWaySearcher { - // constants - /// critical factorization index - crit_pos: usize, - /// critical factorization index for reversed needle - crit_pos_back: usize, - period: usize, - /// `byteset` is an extension (not part of the two way algorithm); - /// it's a 64-bit "fingerprint" where each set bit `j` corresponds - /// to a (byte & 63) == j present in the needle. - byteset: u64, - - // variables - position: usize, - end: usize, - /// index into needle before which we have already matched - memory: usize, - /// index into needle after which we have already matched - memory_back: usize, -} - -/* - This is the Two-Way search algorithm, which was introduced in the paper: - Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675. - - Here's some background information. - - A *word* is a string of symbols. The *length* of a word should be a familiar - notion, and here we denote it for any word x by |x|. 
- (We also allow for the possibility of the *empty word*, a word of length zero). - - If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a - *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p]. - For example, both 1 and 2 are periods for the string "aa". As another example, - the only period of the string "abcd" is 4. - - We denote by period(x) the *smallest* period of x (provided that x is non-empty). - This is always well-defined since every non-empty word x has at least one period, - |x|. We sometimes call this *the period* of x. - - If u, v and x are words such that x = uv, where uv is the concatenation of u and - v, then we say that (u, v) is a *factorization* of x. - - Let (u, v) be a factorization for a word x. Then if w is a non-empty word such - that both of the following hold - - - either w is a suffix of u or u is a suffix of w - - either w is a prefix of v or v is a prefix of w - - then w is said to be a *repetition* for the factorization (u, v). - - Just to unpack this, there are four possibilities here. Let w = "abc". Then we - might have: - - - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde") - - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab") - - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi") - - u is a suffix of w and v is a prefix of w. ex: ("bc", "a") - - Note that the word vu is a repetition for any factorization (u,v) of x = uv, - so every factorization has at least one repetition. - - If x is a string and (u, v) is a factorization for x, then a *local period* for - (u, v) is an integer r such that there is some word w such that |w| = r and w is - a repetition for (u, v). - - We denote by local_period(u, v) the smallest local period of (u, v). We sometimes - call this *the local period* of (u, v). Provided that x = uv is non-empty, this - is well-defined (because each non-empty word has at least one factorization, as - noted above). 
- - It can be proven that the following is an equivalent definition of a local period - for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for - all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are - defined. (i.e., i > 0 and i + r < |x|). - - Using the above reformulation, it is easy to prove that - - 1 <= local_period(u, v) <= period(uv) - - A factorization (u, v) of x such that local_period(u,v) = period(x) is called a - *critical factorization*. - - The algorithm hinges on the following theorem, which is stated without proof: - - **Critical Factorization Theorem** Any word x has at least one critical - factorization (u, v) such that |u| < period(x). - - The purpose of maximal_suffix is to find such a critical factorization. - - If the period is short, compute another factorization x = u' v' to use - for reverse search, chosen instead so that |v'| < period(x). - -*/ -impl TwoWaySearcher { - fn new(needle: &[u8], end: usize) -> TwoWaySearcher { - let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false); - let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true); - - let (crit_pos, period) = - if crit_pos_false > crit_pos_true { - (crit_pos_false, period_false) - } else { - (crit_pos_true, period_true) - }; - - // A particularly readable explanation of what's going on here can be found - // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically - // see the code for "Algorithm CP" on p. 323. - // - // What's going on is we have some critical factorization (u, v) of the - // needle, and we want to determine whether u is a suffix of - // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use - // "Algorithm CP2", which is optimized for when the period of the needle - // is large. - if &needle[..crit_pos] == &needle[period.. 
period + crit_pos] { - // short period case -- the period is exact - // compute a separate critical factorization for the reversed needle - // x = u' v' where |v'| < period(x). - // - // This is sped up by the period being known already. - // Note that a case like x = "acba" may be factored exactly forwards - // (crit_pos = 1, period = 3) while being factored with approximate - // period in reverse (crit_pos = 2, period = 2). We use the given - // reverse factorization but keep the exact period. - let crit_pos_back = needle.len() - cmp::max( - TwoWaySearcher::reverse_maximal_suffix(needle, period, false), - TwoWaySearcher::reverse_maximal_suffix(needle, period, true)); - - TwoWaySearcher { - crit_pos, - crit_pos_back, - period, - byteset: Self::byteset_create(&needle[..period]), - - position: 0, - end, - memory: 0, - memory_back: needle.len(), - } - } else { - // long period case -- we have an approximation to the actual period, - // and don't use memorization. - // - // Approximate the period by lower bound max(|u|, |v|) + 1. - // The critical factorization is efficient to use for both forward and - // reverse search. - - TwoWaySearcher { - crit_pos, - crit_pos_back: crit_pos, - period: cmp::max(crit_pos, needle.len() - crit_pos) + 1, - byteset: Self::byteset_create(needle), - - position: 0, - end, - memory: usize::MAX, // Dummy value to signify that the period is long - memory_back: usize::MAX, - } - } - } - - #[inline] - fn byteset_create(bytes: &[u8]) -> u64 { - bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a) - } - - #[inline] - fn byteset_contains(&self, byte: u8) -> bool { - (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0 - } - - // One of the main ideas of Two-Way is that we factorize the needle into - // two halves, (u, v), and begin trying to find v in the haystack by scanning - // left to right. If v matches, we try to match u by scanning right to left. 
- // How far we can jump when we encounter a mismatch is all based on the fact - // that (u, v) is a critical factorization for the needle. - #[inline] - fn next<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) - -> S::Output - where S: TwoWayStrategy - { - // `next()` uses `self.position` as its cursor - let old_pos = self.position; - let needle_last = needle.len() - 1; - 'search: loop { - // Check that we have room to search in - // position + needle_last can not overflow if we assume slices - // are bounded by isize's range. - let tail_byte = match haystack.get(self.position + needle_last) { - Some(&b) => b, - None => { - self.position = haystack.len(); - return S::rejecting(old_pos, self.position); - } - }; - - if S::use_early_reject() && old_pos != self.position { - return S::rejecting(old_pos, self.position); - } - - // Quickly skip by large portions unrelated to our substring - if !self.byteset_contains(tail_byte) { - self.position += needle.len(); - if !long_period { - self.memory = 0; - } - continue 'search; - } - - // See if the right part of the needle matches - let start = if long_period { self.crit_pos } - else { cmp::max(self.crit_pos, self.memory) }; - for i in start..needle.len() { - if needle[i] != haystack[self.position + i] { - self.position += i - self.crit_pos + 1; - if !long_period { - self.memory = 0; - } - continue 'search; - } - } - - // See if the left part of the needle matches - let start = if long_period { 0 } else { self.memory }; - for i in (start..self.crit_pos).rev() { - if needle[i] != haystack[self.position + i] { - self.position += self.period; - if !long_period { - self.memory = needle.len() - self.period; - } - continue 'search; - } - } - - // We have found a match! 
- let match_pos = self.position; - - // Note: add self.period instead of needle.len() to have overlapping matches - self.position += needle.len(); - if !long_period { - self.memory = 0; // set to needle.len() - self.period for overlapping matches - } - - return S::matching(match_pos, match_pos + needle.len()); - } - } - - // Follows the ideas in `next()`. - // - // The definitions are symmetrical, with period(x) = period(reverse(x)) - // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v) - // is a critical factorization, so is (reverse(v), reverse(u)). - // - // For the reverse case we have computed a critical factorization x = u' v' - // (field `crit_pos_back`). We need |u| < period(x) for the forward case and - // thus |v'| < period(x) for the reverse. - // - // To search in reverse through the haystack, we search forward through - // a reversed haystack with a reversed needle, matching first u' and then v'. - #[inline] - fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) - -> S::Output - where S: TwoWayStrategy - { - // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()` - // are independent. - let old_end = self.end; - 'search: loop { - // Check that we have room to search in - // end - needle.len() will wrap around when there is no more room, - // but due to slice length limits it can never wrap all the way back - // into the length of haystack. 
- let front_byte = match haystack.get(self.end.wrapping_sub(needle.len())) { - Some(&b) => b, - None => { - self.end = 0; - return S::rejecting(0, old_end); - } - }; - - if S::use_early_reject() && old_end != self.end { - return S::rejecting(self.end, old_end); - } - - // Quickly skip by large portions unrelated to our substring - if !self.byteset_contains(front_byte) { - self.end -= needle.len(); - if !long_period { - self.memory_back = needle.len(); - } - continue 'search; - } - - // See if the left part of the needle matches - let crit = if long_period { self.crit_pos_back } - else { cmp::min(self.crit_pos_back, self.memory_back) }; - for i in (0..crit).rev() { - if needle[i] != haystack[self.end - needle.len() + i] { - self.end -= self.crit_pos_back - i; - if !long_period { - self.memory_back = needle.len(); - } - continue 'search; - } - } - - // See if the right part of the needle matches - let needle_end = if long_period { needle.len() } - else { self.memory_back }; - for i in self.crit_pos_back..needle_end { - if needle[i] != haystack[self.end - needle.len() + i] { - self.end -= self.period; - if !long_period { - self.memory_back = self.period; - } - continue 'search; - } - } - - // We have found a match! - let match_pos = self.end - needle.len(); - // Note: sub self.period instead of needle.len() to have overlapping matches - self.end -= needle.len(); - if !long_period { - self.memory_back = needle.len(); - } - - return S::matching(match_pos, match_pos + needle.len()); - } - } - - // Compute the maximal suffix of `arr`. - // - // The maximal suffix is a possible critical factorization (u, v) of `arr`. - // - // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the - // period of v. - // - // `order_greater` determines if lexical order is `<` or `>`. Both - // orders must be computed -- the ordering with the largest `i` gives - // a critical factorization. - // - // For long period cases, the resulting period is not exact (it is too short). 
- #[inline] - fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) { - let mut left = 0; // Corresponds to i in the paper - let mut right = 1; // Corresponds to j in the paper - let mut offset = 0; // Corresponds to k in the paper, but starting at 0 - // to match 0-based indexing. - let mut period = 1; // Corresponds to p in the paper - - while let Some(&a) = arr.get(right + offset) { - // `left` will be inbounds when `right` is. - let b = arr[left + offset]; - if (a < b && !order_greater) || (a > b && order_greater) { - // Suffix is smaller, period is entire prefix so far. - right += offset + 1; - offset = 0; - period = right - left; - } else if a == b { - // Advance through repetition of the current period. - if offset + 1 == period { - right += offset + 1; - offset = 0; - } else { - offset += 1; - } - } else { - // Suffix is larger, start over from current location. - left = right; - right += 1; - offset = 0; - period = 1; - } - } - (left, period) - } - - // Compute the maximal suffix of the reverse of `arr`. - // - // The maximal suffix is a possible critical factorization (u', v') of `arr`. - // - // Returns `i` where `i` is the starting index of v', from the back; - // returns immediately when a period of `known_period` is reached. - // - // `order_greater` determines if lexical order is `<` or `>`. Both - // orders must be computed -- the ordering with the largest `i` gives - // a critical factorization. - // - // For long period cases, the resulting period is not exact (it is too short). - fn reverse_maximal_suffix(arr: &[u8], known_period: usize, - order_greater: bool) -> usize - { - let mut left = 0; // Corresponds to i in the paper - let mut right = 1; // Corresponds to j in the paper - let mut offset = 0; // Corresponds to k in the paper, but starting at 0 - // to match 0-based indexing. 
- let mut period = 1; // Corresponds to p in the paper - let n = arr.len(); - - while right + offset < n { - let a = arr[n - (1 + right + offset)]; - let b = arr[n - (1 + left + offset)]; - if (a < b && !order_greater) || (a > b && order_greater) { - // Suffix is smaller, period is entire prefix so far. - right += offset + 1; - offset = 0; - period = right - left; - } else if a == b { - // Advance through repetition of the current period. - if offset + 1 == period { - right += offset + 1; - offset = 0; - } else { - offset += 1; - } - } else { - // Suffix is larger, start over from current location. - left = right; - right += 1; - offset = 0; - period = 1; - } - if period == known_period { - break; - } - } - debug_assert!(period <= known_period); - left - } -} - -// TwoWayStrategy allows the algorithm to either skip non-matches as quickly -// as possible, or to work in a mode where it emits Rejects relatively quickly. -trait TwoWayStrategy { - type Output; - fn use_early_reject() -> bool; - fn rejecting(a: usize, b: usize) -> Self::Output; - fn matching(a: usize, b: usize) -> Self::Output; -} - -/// Skip to match intervals as quickly as possible -enum MatchOnly { } - -impl TwoWayStrategy for MatchOnly { - type Output = Option<(usize, usize)>; - - #[inline] - fn use_early_reject() -> bool { false } - #[inline] - fn rejecting(_a: usize, _b: usize) -> Self::Output { None } - #[inline] - fn matching(a: usize, b: usize) -> Self::Output { Some((a, b)) } -} - -/// Emit Rejects regularly -enum RejectAndMatch { } - -impl TwoWayStrategy for RejectAndMatch { - type Output = SearchStep; - - #[inline] - fn use_early_reject() -> bool { true } - #[inline] - fn rejecting(a: usize, b: usize) -> Self::Output { SearchStep::Reject(a, b) } - #[inline] - fn matching(a: usize, b: usize) -> Self::Output { SearchStep::Match(a, b) } -} diff --git a/src/libcore/tests/lib.rs b/src/libcore/tests/lib.rs index fcdeb57f482a9..4a177a502499b 100644 --- a/src/libcore/tests/lib.rs +++ 
b/src/libcore/tests/lib.rs @@ -13,7 +13,7 @@ #![feature(iter_copied)] #![feature(iter_nth_back)] #![feature(iter_once_with)] -#![feature(pattern)] +#![feature(needle)] #![feature(range_is_empty)] #![feature(raw)] #![feature(slice_patterns)] diff --git a/src/libcore/tests/pattern.rs b/src/libcore/tests/pattern.rs index b78ed0210770f..5027f7088f6e7 100644 --- a/src/libcore/tests/pattern.rs +++ b/src/libcore/tests/pattern.rs @@ -1,109 +1,61 @@ -use std::str::pattern::*; +use core::needle::*; // This macro makes it easier to write // tests that do a series of iterations macro_rules! search_asserts { - ($haystack:expr, $needle:expr, $testname:expr, [$($func:ident),*], $result:expr) => { - let mut searcher = $needle.into_searcher($haystack); - let arr = [$( Step::from(searcher.$func()) ),+]; - assert_eq!(&arr[..], &$result, $testname); - } + ($haystack:expr, $needle:expr, $testname:expr, [$($op:ident = $expected:expr,)*]) => { + let mut searcher = ext::match_ranges($haystack, $needle).map(|(r, _)| r); + let actual = [$(searcher.$op()),*]; + assert_eq!(&actual[..], &[$($expected),*][..], $testname); + }; } -/// Combined enum for the results of next() and next_match()/next_reject() -#[derive(Debug, PartialEq, Eq)] -enum Step { - // variant names purposely chosen to - // be the same length for easy alignment - Matches(usize, usize), - Rejects(usize, usize), - InRange(usize, usize), - Done -} - -use self::Step::*; - -impl From<SearchStep> for Step { - fn from(x: SearchStep) -> Self { - match x { - SearchStep::Match(a, b) => Matches(a, b), - SearchStep::Reject(a, b) => Rejects(a, b), - SearchStep::Done => Done - } - } -} - -impl From<Option<(usize, usize)>> for Step { - fn from(x: Option<(usize, usize)>) -> Self { - match x { - Some((a, b)) => InRange(a, b), - None => Done - } - } -} - -// ignore-tidy-linelength - -// FIXME(Manishearth) these tests focus on single-character searching (CharSearcher) -// and on next()/next_match(), not next_reject(). 
This is because -// the memchr changes make next_match() for single chars complex, but next_reject() -// continues to use next() under the hood. We should add more test cases for all -// of these, as well as tests for StrSearcher and higher level tests for str::find() (etc) - #[test] fn test_simple_iteration() { - search_asserts! ("abcdeabcd", 'a', "forward iteration for ASCII string", - // a b c d e a b c d EOF - [next, next, next, next, next, next, next, next, next, next], - [Matches(0, 1), Rejects(1, 2), Rejects(2, 3), Rejects(3, 4), Rejects(4, 5), Matches(5, 6), Rejects(6, 7), Rejects(7, 8), Rejects(8, 9), Done] - ); - - search_asserts! ("abcdeabcd", 'a', "reverse iteration for ASCII string", - // d c b a e d c b a EOF - [next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back], - [Rejects(8, 9), Rejects(7, 8), Rejects(6, 7), Matches(5, 6), Rejects(4, 5), Rejects(3, 4), Rejects(2, 3), Rejects(1, 2), Matches(0, 1), Done] - ); - - search_asserts! ("我爱我的猫", '我', "forward iteration for Chinese string", - // 我 愛 我 的 貓 EOF - [next, next, next, next, next, next], - [Matches(0, 3), Rejects(3, 6), Matches(6, 9), Rejects(9, 12), Rejects(12, 15), Done] - ); - - search_asserts! ("我的猫说meow", 'm', "forward iteration for mixed string", - // 我 的 猫 说 m e o w EOF - [next, next, next, next, next, next, next, next, next], - [Rejects(0, 3), Rejects(3, 6), Rejects(6, 9), Rejects(9, 12), Matches(12, 13), Rejects(13, 14), Rejects(14, 15), Rejects(15, 16), Done] - ); - - search_asserts! ("我的猫说meow", '猫', "reverse iteration for mixed string", - // w o e m 说 猫 的 我 EOF - [next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back], - [Rejects(15, 16), Rejects(14, 15), Rejects(13, 14), Rejects(12, 13), Rejects(9, 12), Matches(6, 9), Rejects(3, 6), Rejects(0, 3), Done] - ); + search_asserts! 
("abcdeabcd", 'a', "forward iteration for ASCII string", [ + next = Some(0..1), + next = Some(5..6), + next = None, + ]); + + search_asserts! ("abcdeabcd", 'a', "reverse iteration for ASCII string", [ + next_back = Some(5..6), + next_back = Some(0..1), + next_back = None, + ]); + + search_asserts! ("我爱我的猫", '我', "forward iteration for Chinese string", [ + next = Some(0..3), + next = Some(6..9), + next = None, + ]); + + search_asserts! ("我的猫说meow", 'm', "forward iteration for mixed string", [ + next = Some(12..13), + next = None, + ]); + + search_asserts! ("我的猫说meow", '猫', "reverse iteration for mixed string", [ + next_back = Some(6..9), + next_back = None, + ]); } #[test] fn test_simple_search() { - search_asserts!("abcdeabcdeabcde", 'a', "next_match for ASCII string", - [next_match, next_match, next_match, next_match], - [InRange(0, 1), InRange(5, 6), InRange(10, 11), Done] - ); - - search_asserts!("abcdeabcdeabcde", 'a', "next_match_back for ASCII string", - [next_match_back, next_match_back, next_match_back, next_match_back], - [InRange(10, 11), InRange(5, 6), InRange(0, 1), Done] - ); - - search_asserts!("abcdeab", 'a', "next_reject for ASCII string", - [next_reject, next_reject, next_match, next_reject, next_reject], - [InRange(1, 2), InRange(2, 3), InRange(5, 6), InRange(6, 7), Done] - ); - - search_asserts!("abcdeabcdeabcde", 'a', "next_reject_back for ASCII string", - [next_reject_back, next_reject_back, next_match_back, next_reject_back, next_reject_back, next_reject_back], - [InRange(14, 15), InRange(13, 14), InRange(10, 11), InRange(9, 10), InRange(8, 9), InRange(7, 8)] - ); + search_asserts!("abcdeabcdeabcde", 'a', "next_match for ASCII string", [ + next = Some(0..1), + next = Some(5..6), + next = Some(10..11), + next = None, + ]); + + search_asserts!("abcdeabcdeabcde", 'a', "next_match_back for ASCII string", [ + next_back = Some(10..11), + next_back = Some(5..6), + next_back = Some(0..1), + next_back = None, + ]); } // Á, 각, ก, 😀 all end in 0x81 @@ 
-117,176 +69,243 @@ fn test_simple_search() { // So we test if next() is correct after each next_match() as well. const STRESS: &str = "Áa🁀bÁꁁfg😁각กᘀ각aÁ각ꁁก😁a"; -#[test] -fn test_stress_indices() { - // this isn't really a test, more of documentation on the indices of each character in the stresstest string - - search_asserts!(STRESS, 'x', "Indices of characters in stress test", - [next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next], - [Rejects(0, 2), // Á - Rejects(2, 3), // a - Rejects(3, 7), // 🁀 - Rejects(7, 8), // b - Rejects(8, 10), // Á - Rejects(10, 13), // ꁁ - Rejects(13, 14), // f - Rejects(14, 15), // g - Rejects(15, 19), // 😀 - Rejects(19, 22), // 각 - Rejects(22, 25), // ก - Rejects(25, 28), // ᘀ - Rejects(28, 31), // 각 - Rejects(31, 32), // a - Rejects(32, 34), // Á - Rejects(34, 37), // 각 - Rejects(37, 40), // ꁁ - Rejects(40, 43), // ก - Rejects(43, 47), // 😀 - Rejects(47, 48), // a - Done] - ); -} - #[test] fn test_forward_search_shared_bytes() { - search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character", - [next_match, next_match, next_match, next_match], - [InRange(0, 2), InRange(8, 10), InRange(32, 34), Done] - ); - - search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character; check if next() still works", - [next_match, next, next_match, next, next_match, next, next_match], - [InRange(0, 2), Rejects(2, 3), InRange(8, 10), Rejects(10, 13), InRange(32, 34), Rejects(34, 37), Done] - ); - - search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character", - [next_match, next, next_match, next_match, next_match], - [InRange(19, 22), Rejects(22, 25), InRange(28, 31), InRange(34, 37), Done] - ); - - search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character; check if next() still works", - [next_match, next, next_match, next, next_match, next, next_match], - [InRange(19, 22), Rejects(22, 25), InRange(28, 31), 
Rejects(31, 32), InRange(34, 37), Rejects(37, 40), Done] - ); - - search_asserts!(STRESS, 'ก', "Forward search for three-byte Thai character", - [next_match, next, next_match, next, next_match], - [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done] - ); - - search_asserts!(STRESS, 'ก', "Forward search for three-byte Thai character; check if next() still works", - [next_match, next, next_match, next, next_match], - [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done] - ); - - search_asserts!(STRESS, '😁', "Forward search for four-byte emoji", - [next_match, next, next_match, next, next_match], - [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done] - ); - - search_asserts!(STRESS, '😁', "Forward search for four-byte emoji; check if next() still works", - [next_match, next, next_match, next, next_match], - [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done] - ); - - search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes", - [next_match, next, next_match, next, next_match], - [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done] - ); - - search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes; check if next() still works", - [next_match, next, next_match, next, next_match], - [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done] - ); + search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character", [ + next = Some(0..2), + next = Some(8..10), + next = Some(32..34), + next = None, + ]); + + search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character", [ + next = Some(19..22), + next = Some(28..31), + next = Some(34..37), + next = None, + ]); + + search_asserts!(STRESS, 'ก', "Forward search for three-byte Thai character", [ + next = Some(22..25), + next = Some(40..43), + next = None, + ]); + + search_asserts!(STRESS, '😁', "Forward search for 
four-byte emoji", [ + next = Some(15..19), + next = Some(43..47), + next = None, + ]); + + search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes", [ + next = Some(10..13), + next = Some(37..40), + next = None, + ]); } #[test] fn test_reverse_search_shared_bytes() { - search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character", - [next_match_back, next_match_back, next_match_back, next_match_back], - [InRange(32, 34), InRange(8, 10), InRange(0, 2), Done] - ); - - search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character; check if next_back() still works", - [next_match_back, next_back, next_match_back, next_back, next_match_back, next_back], - [InRange(32, 34), Rejects(31, 32), InRange(8, 10), Rejects(7, 8), InRange(0, 2), Done] - ); - - search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character", - [next_match_back, next_back, next_match_back, next_match_back, next_match_back], - [InRange(34, 37), Rejects(32, 34), InRange(28, 31), InRange(19, 22), Done] - ); - - search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character; check if next_back() still works", - [next_match_back, next_back, next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(34, 37), Rejects(32, 34), InRange(28, 31), Rejects(25, 28), InRange(19, 22), Rejects(15, 19), Done] - ); - - search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done] - ); - - search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character; check if next_back() still works", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done] - ); - - search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji", - 
[next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done] - ); + search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character", [ + next_back = Some(32..34), + next_back = Some(8..10), + next_back = Some(0..2), + next_back = None, + ]); + + search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character", [ + next_back = Some(34..37), + next_back = Some(28..31), + next_back = Some(19..22), + next_back = None, + ]); + + search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character", [ + next_back = Some(40..43), + next_back = Some(22..25), + next_back = None, + ]); + + search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji", [ + next_back = Some(43..47), + next_back = Some(15..19), + next_back = None, + ]); + + search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes", [ + next_back = Some(37..40), + next_back = Some(10..13), + next_back = None, + ]); +} - search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji; check if next_back() still works", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done] - ); +#[test] +fn double_ended_regression_test() { + search_asserts!("abcdeabcdeabcde", 'a', "alternating double ended search", [ + next = Some(0..1), + next_back = Some(10..11), + next = Some(5..6), + next_back = None, + ]); + + search_asserts!("abcdeabcdeabcde", 'a', "triple double ended search for a", [ + next = Some(0..1), + next_back = Some(10..11), + next_back = Some(5..6), + next_back = None, + ]); + + search_asserts!("abcdeabcdeabcde", 'd', "triple double ended search for d", [ + next = Some(3..4), + next_back = Some(13..14), + next_back = Some(8..9), + next_back = None, + ]); + + search_asserts!(STRESS, 'Á', "Double ended search for two-byte Latin character", [ + next = 
Some(0..2), + next_back = Some(32..34), + next = Some(8..10), + next_back = None, + ]); + + search_asserts!(STRESS, '각', "Reverse double ended search for three-byte Hangul character", [ + next_back = Some(34..37), + next = Some(19..22), + next_back = Some(28..31), + next = None, + ]); + + search_asserts!(STRESS, 'ก', "Double ended search for three-byte Thai character", [ + next = Some(22..25), + next_back = Some(40..43), + next = None, + ]); + + search_asserts!(STRESS, '😁', "Double ended search for four-byte emoji", [ + next_back = Some(43..47), + next = Some(15..19), + next = None, + ]); + + search_asserts!(STRESS, 'ꁁ', "Double ended search for 3-byte Yi char with repeated bytes", [ + next = Some(10..13), + next_back = Some(37..40), + next = None, + ]); +} - search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done] - ); +#[test] +fn test_stress_indices() { + // this isn't really a test, + // more of documentation on the indices of each character in the stresstest string + + search_asserts!(STRESS, |_: char| true, "Indices of characters in stress test", [ + next = Some(0..2), // Á + next = Some(2..3), // a + next = Some(3..7), // 🁀 + next = Some(7..8), // b + next = Some(8..10), // Á + next = Some(10..13), // ꁁ + next = Some(13..14), // f + next = Some(14..15), // g + next = Some(15..19), // 😁 + next = Some(19..22), // 각 + next = Some(22..25), // ก + next = Some(25..28), // ᘀ + next = Some(28..31), // 각 + next = Some(31..32), // a + next = Some(32..34), // Á + next = Some(34..37), // 각 + next = Some(37..40), // ꁁ + next = Some(40..43), // ก + next = Some(43..47), // 😁 + next = Some(47..48), // a + next = None, + ]); + + search_asserts!(STRESS, |_: char| true, "Indices of characters in stress test, reversed", [ + next_back = Some(47..48), // a + next_back = Some(43..47), // 😁 + 
next_back = Some(40..43), // ก + next_back = Some(37..40), // ꁁ + next_back = Some(34..37), // 각 + next_back = Some(32..34), // Á + next_back = Some(31..32), // a + next_back = Some(28..31), // 각 + next_back = Some(25..28), // ᘀ + next_back = Some(22..25), // ก + next_back = Some(19..22), // 각 + next_back = Some(15..19), // 😁 + next_back = Some(14..15), // g + next_back = Some(13..14), // f + next_back = Some(10..13), // ꁁ + next_back = Some(8..10), // Á + next_back = Some(7..8), // b + next_back = Some(3..7), // 🁀 + next_back = Some(2..3), // a + next_back = Some(0..2), // Á + next_back = None, + ]); +} - search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes; check if next_back() still works", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done] +#[test] +fn test_fn_double_ended() { + search_asserts!( + STRESS, + |c: char| c >= '\u{10000}', + "Search for all non-BMP characters, double ended", + [ + next = Some(3..7), + next_back = Some(43..47), + next = Some(15..19), + next_back = None, + next = None, + ] ); } #[test] -fn double_ended_regression_test() { - // https://github.com/rust-lang/rust/issues/47175 - // Ensures that double ended searching comes to a convergence - search_asserts!("abcdeabcdeabcde", 'a', "alternating double ended search", - [next_match, next_match_back, next_match, next_match_back], - [InRange(0, 1), InRange(10, 11), InRange(5, 6), Done] - ); - search_asserts!("abcdeabcdeabcde", 'a', "triple double ended search for a", - [next_match, next_match_back, next_match_back, next_match_back], - [InRange(0, 1), InRange(10, 11), InRange(5, 6), Done] - ); - search_asserts!("abcdeabcdeabcde", 'd', "triple double ended search for d", - [next_match, next_match_back, next_match_back, next_match_back], - [InRange(3, 4), InRange(13, 14), InRange(8, 9), Done] - ); - search_asserts!(STRESS, 'Á', "Double ended search for 
two-byte Latin character", - [next_match, next_match_back, next_match, next_match_back], - [InRange(0, 2), InRange(32, 34), InRange(8, 10), Done] - ); - search_asserts!(STRESS, '각', "Reverse double ended search for three-byte Hangul character", - [next_match_back, next_back, next_match, next, next_match_back, next_match], - [InRange(34, 37), Rejects(32, 34), InRange(19, 22), Rejects(22, 25), InRange(28, 31), Done] - ); - search_asserts!(STRESS, 'ก', "Double ended search for three-byte Thai character", - [next_match, next_back, next, next_match_back, next_match], - [InRange(22, 25), Rejects(47, 48), Rejects(25, 28), InRange(40, 43), Done] - ); - search_asserts!(STRESS, '😁', "Double ended search for four-byte emoji", - [next_match_back, next, next_match, next_back, next_match], - [InRange(43, 47), Rejects(0, 2), InRange(15, 19), Rejects(40, 43), Done] - ); - search_asserts!(STRESS, 'ꁁ', "Double ended search for three-byte Yi character with repeated bytes", - [next_match, next, next_match_back, next_back, next_match], - [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(34, 37), Done] - ); +fn test_str() { + search_asserts!("abbcbbd", "bb", "str_searcher_ascii_haystack::fwd", [ + next = Some(1..3), + next = Some(4..6), + next = None, + ]); + + search_asserts!("abbcbbbbd", "bb", "str_searcher_ascii_haystack_seq::fwd", [ + next = Some(1..3), + next = Some(4..6), + next = Some(6..8), + next = None, + ]); + + search_asserts!("abbcbbd", "", "str_searcher_empty_needle_ascii_haystack::fwd", [ + next = Some(0..0), + next = Some(1..1), + next = Some(2..2), + next = Some(3..3), + next = Some(4..4), + next = Some(5..5), + next = Some(6..6), + next = Some(7..7), + next = None, + ]); + + search_asserts!("├──", " ", "str_searcher_multibyte_haystack::fwd", [ + next = None, + ]); + + search_asserts!("├──", "", "str_searcher_empty_needle_multibyte_haystack::fwd", [ + next = Some(0..0), + next = Some(3..3), + next = Some(6..6), + next = Some(9..9), + next = None, + ]); + + 
search_asserts!("", "", "str_searcher_empty_needle_multibyte_haystack::fwd", [ + next = Some(0..0), + next = None, + ]); } diff --git a/src/libcore/tests/slice.rs b/src/libcore/tests/slice.rs index acf6b03791f01..46afb3787fa86 100644 --- a/src/libcore/tests/slice.rs +++ b/src/libcore/tests/slice.rs @@ -1475,3 +1475,10 @@ fn test_is_sorted() { assert!(!["c", "bb", "aaa"].is_sorted()); assert!(["c", "bb", "aaa"].is_sorted_by_key(|s| s.len())); } + +#[test] +fn test_double_array_ref_is_a_needle() { + static MAGIC_NUMBERS: [&[u8]; 2] = [b"%PDF", b"\x89PNG"]; + let buffer: &[u8] = b"%PDF123"; + assert!(MAGIC_NUMBERS.iter().any(|magic| buffer.starts_with(magic))); +} diff --git a/src/libstd/ffi/os_str.rs b/src/libstd/ffi/os_str.rs index 13aee783750f1..577ec9e687472 100644 --- a/src/libstd/ffi/os_str.rs +++ b/src/libstd/ffi/os_str.rs @@ -5,10 +5,16 @@ use crate::cmp; use crate::hash::{Hash, Hasher}; use crate::rc::Rc; use crate::sync::Arc; +use crate::needle::{ + ext, Hay, Haystack, Needle, Span, Searcher, ReverseSearcher, + Consumer, ReverseConsumer, DoubleEndedConsumer, +}; -use crate::sys::os_str::{Buf, Slice}; +use crate::sys::os_str::{Buf, InnerSearcher, Slice}; use crate::sys_common::{AsInner, IntoInner, FromInner}; +use core::slice::needles::{TwoWaySearcher, SliceSearcher, NaiveSearcher}; + /// A type that can represent owned, mutable platform-native strings, but is /// cheaply inter-convertible with Rust strings. 
/// @@ -373,6 +379,36 @@ impl ops::Index<ops::RangeFull> for OsString { } } +#[stable(feature = "os_str_slice", since = "1.36.0")] +impl ops::Index<ops::Range<usize>> for OsString { + type Output = OsStr; + + #[inline] + fn index(&self, index: ops::Range<usize>) -> &OsStr { + OsStr::from_inner(&self.inner.as_slice()[index]) + } +} + +#[stable(feature = "os_str_slice", since = "1.36.0")] +impl ops::Index<ops::RangeFrom<usize>> for OsString { + type Output = OsStr; + + #[inline] + fn index(&self, index: ops::RangeFrom<usize>) -> &OsStr { + OsStr::from_inner(&self.inner.as_slice()[index]) + } +} + +#[stable(feature = "os_str_slice", since = "1.36.0")] +impl ops::Index<ops::RangeTo<usize>> for OsString { + type Output = OsStr; + + #[inline] + fn index(&self, index: ops::RangeTo<usize>) -> &OsStr { + OsStr::from_inner(&self.inner.as_slice()[index]) + } +} + #[stable(feature = "rust1", since = "1.0.0")] impl ops::Deref for OsString { type Target = OsStr; @@ -646,12 +682,324 @@ impl OsStr { OsString { inner: Buf::from_box(boxed) } } - /// Gets the underlying byte representation. + /// Returns `true` if the given needle matches a prefix of this `OsStr`. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn starts_with<'a, P>(&'a self, needle: P) -> bool + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::starts_with(self, needle) + } + + /// Returns `true` if the given needle matches a suffix of this `OsStr`. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn ends_with<'a, P>(&'a self, needle: P) -> bool + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: ReverseConsumer<OsStr>, + { + ext::ends_with(self, needle) + } + + /// Returns `true` if the given needle matches a sub-slice of this `OsStr`. 
+ #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn contains<'a, P>(&'a self, needle: P) -> bool + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::contains(self, needle) + } + + /// Returns the start index of first slice of this `OsStr` that matches the + /// needle. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn find<'a, P>(&'a self, needle: P) -> Option<usize> + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::find(self, needle) + } + + /// Returns the start index of last slice of this `OsStr` that matches the + /// needle. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn rfind<'a, P>(&'a self, needle: P) -> Option<usize> + where + P: Needle<&'a OsStr>, + P::Searcher: ReverseSearcher<OsStr>, + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::rfind(self, needle) + } + + /// Returns the index range of first slice of this `OsStr` that matches the + /// needle. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn find_range<'a, P>(&'a self, needle: P) -> Option<ops::Range<usize>> + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::find_range(self, needle) + } + + /// Returns the index range of last slice of this `OsStr` that matches the + /// needle. 
+ #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn rfind_range<'a, P>(&'a self, needle: P) -> Option<ops::Range<usize>> + where + P: Needle<&'a OsStr>, + P::Searcher: ReverseSearcher<OsStr>, + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::rfind_range(self, needle) + } + + /// Returns an `OsStr` slice with all prefixes that match the needle + /// repeatedly removed. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn trim_start_matches<'a, P>(&'a self, needle: P) -> &'a OsStr + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::trim_start(self, needle) + } + + /// Returns an `OsStr` slice with all suffixes that match the needle + /// repeatedly removed. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn trim_end_matches<'a, P>(&'a self, needle: P) -> &'a OsStr + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: ReverseConsumer<OsStr>, + { + ext::trim_end(self, needle) + } + + /// Returns an `OsStr` slice with all prefixes and suffixes that match the + /// needle repeatedly removed. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn trim_matches<'a, P>(&'a self, needle: P) -> &'a OsStr + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: DoubleEndedConsumer<OsStr>, + { + ext::trim(self, needle) + } + + /// An iterator over the disjoint matches of the needle within the given + /// `OsStr`. 
+ #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn matches<'a, P>(&'a self, needle: P) -> ext::Matches<&'a OsStr, P::Searcher> + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::matches(self, needle) + } + + /// An iterator over the disjoint matches of the needle within the given + /// `OsStr`, yielded in reverse order. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn rmatches<'a, P>(&'a self, needle: P) -> ext::RMatches<&'a OsStr, P::Searcher> + where + P: Needle<&'a OsStr>, + P::Searcher: ReverseSearcher<OsStr>, + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::rmatches(self, needle) + } + + /// An iterator over the disjoint matches of a needle within this `OsStr` + /// as well as the index that the match starts at. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn match_indices<'a, P>(&'a self, needle: P) -> ext::MatchIndices<&'a OsStr, P::Searcher> + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::match_indices(self, needle) + } + + /// An iterator over the disjoint matches of a needle within this `OsStr`, + /// yielded in reverse order along with the index of the match. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn rmatch_indices<'a, P>(&'a self, needle: P) -> ext::RMatchIndices<&'a OsStr, P::Searcher> + where + P: Needle<&'a OsStr>, + P::Searcher: ReverseSearcher<OsStr>, + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::rmatch_indices(self, needle) + } + + /// An iterator over the disjoint matches of a needle within this `OsStr` + /// as well as the index ranges of each match. 
+ #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn match_ranges<'a, P>(&'a self, needle: P) -> ext::MatchRanges<&'a OsStr, P::Searcher> + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::match_ranges(self, needle) + } + + /// An iterator over the disjoint matches of a needle within this `OsStr`, + /// yielded in reverse order along with the index ranges of each match. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn rmatch_ranges<'a, P>(&'a self, needle: P) -> ext::RMatchRanges<&'a OsStr, P::Searcher> + where + P: Needle<&'a OsStr>, + P::Searcher: ReverseSearcher<OsStr>, + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::rmatch_ranges(self, needle) + } + + /// An iterator over slices of this `OsStr`, separated by parts matched by + /// the needle. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn split<'a, P>(&'a self, needle: P) -> ext::Split<&'a OsStr, P::Searcher> + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::split(self, needle) + } + + /// An iterator over slices of this `OsStr`, separated by parts matched by + /// the needle and yielded in reverse order. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn rsplit<'a, P>(&'a self, needle: P) -> ext::RSplit<&'a OsStr, P::Searcher> + where + P: Needle<&'a OsStr>, + P::Searcher: ReverseSearcher<OsStr>, + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::rsplit(self, needle) + } + + /// An iterator over slices of this `OsStr`, separated by parts matched by + /// the needle. /// - /// Note: it is *crucial* that this API is private, to avoid - /// revealing the internal, platform-specific encodings. 
- fn bytes(&self) -> &[u8] { - unsafe { &*(&self.inner as *const _ as *const [u8]) } + /// Equivalent to [`split`](#method.split), except that the trailing slice + /// is skipped if empty. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn split_terminator<'a, P>(&'a self, needle: P) + -> ext::SplitTerminator<&'a OsStr, P::Searcher> + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::split_terminator(self, needle) + } + + /// An iterator over slices of this `OsStr`, separated by parts matched by + /// the needle and yielded in reverse order. + /// + /// Equivalent to [`rsplit`](#method.rsplit), except that the trailing slice + /// is skipped if empty. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn rsplit_terminator<'a, P>(&'a self, needle: P) + -> ext::RSplitTerminator<&'a OsStr, P::Searcher> + where + P: Needle<&'a OsStr>, + P::Searcher: ReverseSearcher<OsStr>, + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::rsplit_terminator(self, needle) + } + + /// An iterator over slices of the given `OsStr`, separated by a needle, + /// restricted to returning at most `n` items. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn splitn<'a, P>(&'a self, n: usize, needle: P) -> ext::SplitN<&'a OsStr, P::Searcher> + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::splitn(self, n, needle) + } + + /// An iterator over slices of the given `OsStr`, separated by a needle, + /// starting from the end of the `OsStr`, restricted to returning at most + /// `n` items. 
+ #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn rsplitn<'a, P>(&'a self, n: usize, needle: P) -> ext::RSplitN<&'a OsStr, P::Searcher> + where + P: Needle<&'a OsStr>, + P::Searcher: ReverseSearcher<OsStr>, + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + ext::rsplitn(self, n, needle) + } + + /// Replaces all matches of a needle with another `OsStr`. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn replace<'s: 'a, 'a, P>(&'s self, from: P, to: &'a OsStr) -> OsString + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + let mut result = OsString::with_capacity(self.len()); + ext::replace_with(self, from, |_| to, |s| result.push(s)); + result + } + + /// Replaces first N matches of a needle with another `OsStr`. + #[unstable(feature = "os_str_needle_methods", issue = "56345")] + #[inline] + pub fn replacen<'s: 'a, 'a, P>(&'s self, from: P, to: &'a OsStr, count: usize) -> OsString + where + P: Needle<&'a OsStr>, + P::Searcher: Searcher<OsStr>, // FIXME: RFC 2089 + P::Consumer: Consumer<OsStr>, // FIXME: RFC 2089 + { + let mut result = OsString::with_capacity(self.len()); + ext::replacen_with(self, from, |_| to, count, |s| result.push(s)); + result } } @@ -789,7 +1137,7 @@ impl Default for &OsStr { #[stable(feature = "rust1", since = "1.0.0")] impl PartialEq for OsStr { fn eq(&self, other: &OsStr) -> bool { - self.bytes().eq(other.bytes()) + self.inner == other.inner } } @@ -814,16 +1162,16 @@ impl Eq for OsStr {} impl PartialOrd for OsStr { #[inline] fn partial_cmp(&self, other: &OsStr) -> Option<cmp::Ordering> { - self.bytes().partial_cmp(other.bytes()) + self.inner.partial_cmp(&other.inner) } #[inline] - fn lt(&self, other: &OsStr) -> bool { self.bytes().lt(other.bytes()) } + fn lt(&self, other: &OsStr) -> bool { self.inner < other.inner } #[inline] - fn le(&self, other: &OsStr) -> bool { 
self.bytes().le(other.bytes()) } + fn le(&self, other: &OsStr) -> bool { self.inner <= other.inner } #[inline] - fn gt(&self, other: &OsStr) -> bool { self.bytes().gt(other.bytes()) } + fn gt(&self, other: &OsStr) -> bool { self.inner > other.inner } #[inline] - fn ge(&self, other: &OsStr) -> bool { self.bytes().ge(other.bytes()) } + fn ge(&self, other: &OsStr) -> bool { self.inner >= other.inner } } #[stable(feature = "rust1", since = "1.0.0")] @@ -840,7 +1188,7 @@ impl PartialOrd<str> for OsStr { #[stable(feature = "rust1", since = "1.0.0")] impl Ord for OsStr { #[inline] - fn cmp(&self, other: &OsStr) -> cmp::Ordering { self.bytes().cmp(other.bytes()) } + fn cmp(&self, other: &OsStr) -> cmp::Ordering { self.inner.cmp(&other.inner) } } macro_rules! impl_cmp { @@ -885,7 +1233,7 @@ impl_cmp!(Cow<'a, OsStr>, OsString); impl Hash for OsStr { #[inline] fn hash<H: Hasher>(&self, state: &mut H) { - self.bytes().hash(state) + self.inner.hash(state) } } @@ -966,6 +1314,46 @@ impl AsInner<Slice> for OsStr { } } +#[stable(feature = "os_str_slice", since = "1.36.0")] +impl ops::Index<ops::RangeFull> for OsStr { + type Output = OsStr; + + #[inline] + fn index(&self, _: ops::RangeFull) -> &OsStr { + self + } +} + +#[stable(feature = "os_str_slice", since = "1.36.0")] +impl ops::Index<ops::Range<usize>> for OsStr { + type Output = OsStr; + + #[inline] + fn index(&self, index: ops::Range<usize>) -> &OsStr { + OsStr::from_inner(&self.inner[index]) + } +} + +#[stable(feature = "os_str_slice", since = "1.36.0")] +impl ops::Index<ops::RangeFrom<usize>> for OsStr { + type Output = OsStr; + + #[inline] + fn index(&self, index: ops::RangeFrom<usize>) -> &OsStr { + OsStr::from_inner(&self.inner[index]) + } +} + +#[stable(feature = "os_str_slice", since = "1.36.0")] +impl ops::Index<ops::RangeTo<usize>> for OsStr { + type Output = OsStr; + + #[inline] + fn index(&self, index: ops::RangeTo<usize>) -> &OsStr { + OsStr::from_inner(&self.inner[index]) + } +} + #[cfg(test)] mod tests { use 
super::*; @@ -1133,4 +1521,408 @@ mod tests { assert_eq!(&*rc2, os_str); assert_eq!(&*arc2, os_str); } + + #[test] + fn slice_with_utf8_boundary() { + let os_str = OsStr::new("Hello🌍🌎🌏"); + assert_eq!(os_str.len(), 17); + + assert_eq!(os_str, &os_str[..]); + assert_eq!(os_str, &os_str[..17]); + assert_eq!(os_str, &os_str[0..]); + assert_eq!(os_str, &os_str[0..17]); + + assert_eq!(OsStr::new("Hello"), &os_str[..5]); + assert_eq!(OsStr::new("🌎🌏"), &os_str[9..]); + assert_eq!(OsStr::new("lo🌍"), &os_str[3..9]); + + let os_string = os_str.to_owned(); + assert_eq!(os_str, &os_string[..]); + assert_eq!(os_str, &os_string[..17]); + assert_eq!(os_str, &os_string[0..]); + assert_eq!(os_str, &os_string[0..17]); + + assert_eq!(OsStr::new("Hello"), &os_string[..5]); + assert_eq!(OsStr::new("🌎🌏"), &os_string[9..]); + assert_eq!(OsStr::new("lo🌍"), &os_string[3..9]); + } + + #[test] + #[cfg(any(unix, target_os = "redox", target_arch = "wasm32"))] + fn slice_with_non_utf8_boundary_unix() { + #[cfg(unix)] + use crate::os::unix::ffi::OsStrExt; + #[cfg(target_os = "redox")] + use crate::os::redox::ffi::OsStrExt; + + let os_str = OsStr::new("Hello🌍🌎🌏"); + assert_eq!(OsStr::from_bytes(b"Hello\xf0"), &os_str[..6]); + assert_eq!(OsStr::from_bytes(b"\x9f\x8c\x8e\xf0\x9f\x8c\x8f"), &os_str[10..]); + assert_eq!(OsStr::from_bytes(b"\x8d\xf0\x9f\x8c\x8e"), &os_str[8..13]); + } + + #[test] + #[cfg(windows)] + fn slice_with_non_utf8_boundary_windows() { + use crate::os::windows::ffi::OsStringExt; + + let os_str = OsStr::new("Hello🌍🌎🌏"); + assert_eq!(OsString::from_wide(&[0x48, 0x65, 0x6C, 0x6C, 0x6F, 0xD83C]), &os_str[..7]); + assert_eq!(OsString::from_wide(&[0xDF0E, 0xD83C, 0xDF0F]), &os_str[11..]); + assert_eq!(OsString::from_wide(&[0xDF0D, 0xD83C]), &os_str[7..11]); + } +} + +#[unstable(feature = "needle", issue = "56345")] +unsafe impl Hay for OsStr { + type Index = usize; + + #[inline] + fn empty<'a>() -> &'a Self { + Self::new("") + } + + #[inline] + fn start_index(&self) -> usize { + 0 + 
} + + #[inline] + fn end_index(&self) -> usize { + self.len() + } + + #[inline] + unsafe fn slice_unchecked(&self, range: ops::Range<usize>) -> &Self { + &self[range] + } + + #[inline] + unsafe fn next_index(&self, index: usize) -> usize { + self.inner.next_index(index) + } + + #[inline] + unsafe fn prev_index(&self, index: usize) -> usize { + self.inner.prev_index(index) + } +} + +// use a macro here since the type of `hay.inner.inner` is platform dependent +// and we don't want to expose that type. +macro_rules! span_as_inner { + ($span:expr) => {{ + let (hay, range) = $span.into_parts(); + unsafe { Span::from_parts(&hay.inner.inner, range) } + }} +} + +fn span_as_inner_bytes(span: Span<&OsStr>) -> Span<&[u8]> { + let (hay, range) = span.into_parts(); + unsafe { Span::from_parts(hay.inner.as_bytes_for_searcher(), range) } +} + +#[unstable(feature = "needle", issue = "56345")] +unsafe impl<'p> Searcher<OsStr> for TwoWaySearcher<'p, u8> { + #[inline] + fn search(&mut self, span: Span<&OsStr>) -> Option<ops::Range<usize>> { + self.search(span_as_inner_bytes(span)) + } } + +#[unstable(feature = "needle", issue = "56345")] +unsafe impl<'p> ReverseSearcher<OsStr> for TwoWaySearcher<'p, u8> { + #[inline] + fn rsearch(&mut self, span: Span<&OsStr>) -> Option<ops::Range<usize>> { + self.rsearch(span_as_inner_bytes(span)) + } +} + +#[unstable(feature = "needle", issue = "56345")] +unsafe impl<'p> Consumer<OsStr> for NaiveSearcher<'p, u8> { + #[inline] + fn consume(&mut self, span: Span<&OsStr>) -> Option<usize> { + self.consume(span_as_inner_bytes(span)) + } + + #[inline] + fn trim_start(&mut self, hay: &OsStr) -> usize { + self.trim_start(hay.inner.as_bytes_for_searcher()) + } +} + +#[unstable(feature = "needle", issue = "56345")] +unsafe impl<'p> ReverseConsumer<OsStr> for NaiveSearcher<'p, u8> { + #[inline] + fn rconsume(&mut self, span: Span<&OsStr>) -> Option<usize> { + self.rconsume(span_as_inner_bytes(span)) + } + + #[inline] + fn trim_end(&mut self, hay: &OsStr) -> 
usize { + self.trim_end(hay.inner.as_bytes_for_searcher()) + } +} + +#[unstable(feature = "needle", issue = "56345")] +#[derive(Debug)] +pub struct OsStrSearcher<S>(InnerSearcher<S>); + +#[unstable(feature = "needle", issue = "56345")] +unsafe impl<'p> Searcher<OsStr> for OsStrSearcher<SliceSearcher<'p, u8>> { + #[inline] + fn search(&mut self, span: Span<&OsStr>) -> Option<ops::Range<usize>> { + self.0.search(span_as_inner!(span)) + } +} + +#[unstable(feature = "needle", issue = "56345")] +unsafe impl<'p> ReverseSearcher<OsStr> for OsStrSearcher<SliceSearcher<'p, u8>> { + #[inline] + fn rsearch(&mut self, span: Span<&OsStr>) -> Option<ops::Range<usize>> { + self.0.rsearch(span_as_inner!(span)) + } +} + +#[unstable(feature = "needle", issue = "56345")] +unsafe impl<'p> Consumer<OsStr> for OsStrSearcher<NaiveSearcher<'p, u8>> { + #[inline] + fn consume(&mut self, span: Span<&OsStr>) -> Option<usize> { + self.0.consume(span_as_inner!(span)) + } + + #[inline] + fn trim_start(&mut self, hay: &OsStr) -> usize { + self.0.trim_start(&hay.inner.inner) + } +} + +#[unstable(feature = "needle", issue = "56345")] +unsafe impl<'p> ReverseConsumer<OsStr> for OsStrSearcher<NaiveSearcher<'p, u8>> { + #[inline] + fn rconsume(&mut self, span: Span<&OsStr>) -> Option<usize> { + self.0.rconsume(span_as_inner!(span)) + } + + #[inline] + fn trim_end(&mut self, hay: &OsStr) -> usize { + self.0.trim_end(&hay.inner.inner) + } +} + +#[unstable(feature = "needle", issue = "56345")] +impl<'p, H: Haystack<Target = OsStr>> Needle<H> for &'p OsStr { + type Searcher = OsStrSearcher<SliceSearcher<'p, u8>>; + type Consumer = OsStrSearcher<NaiveSearcher<'p, u8>>; + + fn into_searcher(self) -> Self::Searcher { + OsStrSearcher(self.inner.into_searcher()) + } + + fn into_consumer(self) -> Self::Consumer { + OsStrSearcher(self.inner.into_consumer()) + } +} + +// FIXME cannot impl `Needle<(_: Haystack<Target = OsStr>)>` due to RFC 1672 being postponed. 
+// (need to wait for chalk) +#[unstable(feature = "needle", issue = "56345")] +impl<'h, 'p> Needle<&'h OsStr> for &'p str { + type Searcher = SliceSearcher<'p, u8>; + type Consumer = NaiveSearcher<'p, u8>; + + fn into_searcher(self) -> Self::Searcher { + SliceSearcher::new(self.as_bytes()) + } + + fn into_consumer(self) -> Self::Consumer { + NaiveSearcher::new(self.as_bytes()) + } +} + +#[cfg(test)] +mod needle_tests { + use super::*; + + #[cfg(windows)] + use crate::os::windows::ffi::OsStringExt; + #[cfg(unix)] + use crate::os::unix::ffi::OsStrExt; + + #[test] + #[cfg(any(unix, target_os = "redox", target_arch = "wasm32"))] + fn test_trim() { + assert_eq!( + OsStr::from_bytes(b"\xaa\xbb\xaa\xcc\xaa\xbb\xaa") + .trim_start_matches(OsStr::from_bytes(b"\xaa")), + OsStr::from_bytes(b"\xbb\xaa\xcc\xaa\xbb\xaa"), + ); + assert_eq!( + OsStr::from_bytes(b"\xaa\xbb\xaa\xcc\xaa\xbb\xaa") + .trim_end_matches(OsStr::from_bytes(b"\xaa")), + OsStr::from_bytes(b"\xaa\xbb\xaa\xcc\xaa\xbb"), + ); + } + + #[test] + #[cfg(windows)] + fn test_trim_start_low_surrogate() { + let pat = OsString::from_wide(&[0xdc00]); + let a = &OsStr::new("\u{10000}aaa")[2..]; + assert_eq!(a.trim_start_matches(&*pat), OsStr::new("aaa")); + + let b = OsString::from_wide(&[0xd800, 0xdc00, 0xdc00, 0x62, 0x62, 0x62]); + assert_eq!(b[2..].trim_start_matches(&*pat), OsStr::new("bbb")); + + let c = OsString::from_wide(&[0xdc00, 0xdc00, 0x63, 0x63, 0x63]); + assert_eq!(c.trim_start_matches(&*pat), OsStr::new("ccc")); + + let d = &OsStr::new("\u{ffc00}ddd")[2..]; + assert_eq!(d.trim_start_matches(&*pat), OsStr::new("ddd")); + + let e = OsStr::new("㰀eee"); + assert_eq!(e.trim_start_matches(&*pat), e); + } + + #[test] + #[cfg(windows)] + fn test_trim_start_high_surrogate() { + let pat = OsString::from_wide(&[0xd800]); + let a = OsStr::new("\u{10000}"); + assert_eq!(a.trim_start_matches(&*pat), &*OsString::from_wide(&[0xdc00])); + + let b = OsString::from_wide(&[0xd800, 0x62, 0x62, 0x62]); + 
assert_eq!(b.trim_start_matches(&*pat), OsStr::new("bbb")); + + let c = OsString::from_wide(&[0xd800, 0xd800, 0xdc00, 0x63, 0x63, 0x63]); + assert_eq!(c.trim_start_matches(&*pat), &c[5..]); + } + + #[test] + #[cfg(windows)] + fn test_trim_end_high_surrogate() { + let pat = OsString::from_wide(&[0xd800]); + let a = OsStr::new("aaa\u{10000}"); + assert_eq!(a[..a.len()-2].trim_end_matches(&*pat), OsStr::new("aaa")); + + let b = OsString::from_wide(&[0x62, 0x62, 0x62, 0xd800, 0xd800, 0xdc00]); + assert_eq!(b[..b.len()-2].trim_end_matches(&*pat), OsStr::new("bbb")); + + let c = OsString::from_wide(&[0x63, 0x63, 0x63, 0xd800, 0xd800]); + assert_eq!(c.trim_end_matches(&*pat), OsStr::new("ccc")); + + let d = OsStr::new("ddd\u{103ff}"); + assert_eq!(d[..d.len()-2].trim_end_matches(&*pat), OsStr::new("ddd")); + + let e = OsStr::new("eee\u{11000}"); + let e = &e[..e.len()-2]; + assert_eq!(e.trim_end_matches(&*pat), e); + + let f = OsString::from_wide(&[0x66, 0x66, 0x66, 0xdc00]); + assert_eq!(f.trim_end_matches(&*pat), &*f); + } + + + #[test] + #[cfg(windows)] + fn test_trim_end_low_surrogate() { + let pat = OsString::from_wide(&[0xdc00]); + let a = OsStr::new("\u{10000}"); + assert_eq!(a.trim_end_matches(&*pat), &*OsString::from_wide(&[0xd800])); + + let b = OsString::from_wide(&[0x62, 0x62, 0x62, 0xdc00]); + assert_eq!(b.trim_end_matches(&*pat), OsStr::new("bbb")); + + let c = OsString::from_wide(&[0x63, 0x63, 0x63, 0xdbff, 0xdc00, 0xdc00]); + assert_eq!(c.trim_end_matches(&*pat), &c[..c.len()-5]); + } + + #[test] + #[cfg(windows)] + fn test_match_string_with_surrogates() { + let haystack = &OsStr::new("\u{10000}a\u{10000}a\u{10000}\u{10000}")[2..16]; + // 0..3 = U+DC00 + // 3..4 = 'a' + // 4..6 = U+D800 + // 6..8 = U+DC00 + // 8..9 = 'a' + // 9..11 = U+D800 + // 11..13 = U+DC00 + // 13..16 = U+D800 + + let pat = "a"; + let matched_pat = OsStr::new(pat); + assert_eq!(haystack.match_ranges(pat).collect::<Vec<_>>(), vec![ + (3..4, matched_pat), + (8..9, matched_pat), + ]); + 
assert_eq!(haystack.rmatch_ranges(pat).collect::<Vec<_>>(), vec![ + (8..9, matched_pat), + (3..4, matched_pat), + ]); + + let pat = OsString::from_wide(&[0xdc00, 0x61]); + assert_eq!(haystack.match_ranges(&*pat).collect::<Vec<_>>(), vec![ + (0..4, &*pat), + (6..9, &*pat), + ]); + assert_eq!(haystack.rmatch_ranges(&*pat).collect::<Vec<_>>(), vec![ + (6..9, &*pat), + (0..4, &*pat), + ]); + + let pat = OsString::from_wide(&[0x61, 0xd800]); + assert_eq!(haystack.match_ranges(&*pat).collect::<Vec<_>>(), vec![ + (3..6, &*pat), + (8..11, &*pat), + ]); + assert_eq!(haystack.rmatch_ranges(&*pat).collect::<Vec<_>>(), vec![ + (8..11, &*pat), + (3..6, &*pat), + ]); + + let pat = "\u{10000}"; + let matched_pat = OsStr::new(pat); + assert_eq!(haystack.match_ranges(pat).collect::<Vec<_>>(), vec![ + (4..8, matched_pat), + (9..13, matched_pat), + ]); + assert_eq!(haystack.rmatch_ranges(pat).collect::<Vec<_>>(), vec![ + (9..13, matched_pat), + (4..8, matched_pat), + ]); + + let pat = OsString::from_wide(&[0xd800]); + assert_eq!(haystack.match_ranges(&*pat).collect::<Vec<_>>(), vec![ + (4..6, &*pat), + (9..11, &*pat), + (13..16, &*pat), + ]); + assert_eq!(haystack.rmatch_ranges(&*pat).collect::<Vec<_>>(), vec![ + (13..16, &*pat), + (9..11, &*pat), + (4..6, &*pat), + ]); + + let pat = OsString::from_wide(&[0xdc00]); + assert_eq!(haystack.match_ranges(&*pat).collect::<Vec<_>>(), vec![ + (0..3, &*pat), + (6..8, &*pat), + (11..13, &*pat), + ]); + assert_eq!(haystack.rmatch_ranges(&*pat).collect::<Vec<_>>(), vec![ + (11..13, &*pat), + (6..8, &*pat), + (0..3, &*pat), + ]); + + let pat = OsString::from_wide(&[0xdc00, 0xd800]); + assert_eq!(haystack.match_ranges(&*pat).collect::<Vec<_>>(), vec![ + (11..16, &*pat), + ]); + assert_eq!(haystack.rmatch_ranges(&*pat).collect::<Vec<_>>(), vec![ + (11..16, &*pat), + ]); + } +} + diff --git a/src/libstd/lib.rs b/src/libstd/lib.rs index 62bc1991cc93c..e17a98d3a6b51 100644 --- a/src/libstd/lib.rs +++ b/src/libstd/lib.rs @@ -274,6 +274,7 @@ 
#![feature(link_args)] #![feature(linkage)] #![feature(maybe_uninit)] +#![feature(needle)] #![feature(needs_panic_runtime)] #![feature(never_type)] #![feature(nll)] @@ -285,6 +286,7 @@ #![feature(panic_unwind)] #![feature(prelude_import)] #![feature(ptr_internals)] +#![feature(ptr_offset_from)] #![feature(raw)] #![feature(renamed_spin_loop)] #![feature(rustc_attrs)] @@ -293,6 +295,7 @@ #![feature(shrink_to)] #![feature(slice_concat_ext)] #![feature(slice_internals)] +#![feature(slice_needle_methods)] #![feature(slice_patterns)] #![feature(staged_api)] #![feature(std_internals)] @@ -436,6 +439,8 @@ pub use core::char; pub use core::u128; #[stable(feature = "core_hint", since = "1.27.0")] pub use core::hint; +#[unstable(feature = "needle", issue = "56345")] +pub use core::needle; pub mod f32; pub mod f64; diff --git a/src/libstd/path.rs b/src/libstd/path.rs index 1bbda9b5bcb1a..bd099d42ff492 100644 --- a/src/libstd/path.rs +++ b/src/libstd/path.rs @@ -315,7 +315,7 @@ unsafe fn u8_slice_as_os_str(s: &[u8]) -> &OsStr { // Detect scheme on Redox fn has_redox_scheme(s: &[u8]) -> bool { - cfg!(target_os = "redox") && s.split(|b| *b == b'/').next().unwrap_or(b"").contains(&b':') + cfg!(target_os = "redox") && s.split_match(b"/").next().unwrap_or(b"").contains(&b':') } //////////////////////////////////////////////////////////////////////////////// @@ -344,7 +344,7 @@ fn split_file_at_dot(file: &OsStr) -> (Option<&OsStr>, Option<&OsStr>) { // contents of the encoding and (2) new &OsStr values are produced // only from ASCII-bounded slices of existing &OsStr values. 
- let mut iter = os_str_as_u8_slice(file).rsplitn(2, |b| *b == b'.'); + let mut iter = os_str_as_u8_slice(file).rsplitn_match(2, b"."); let after = iter.next(); let before = iter.next(); if before == Some(b"") { diff --git a/src/libstd/sys/windows/args.rs b/src/libstd/sys/windows/args.rs index b04bb484eedb9..6885dc6c9851a 100644 --- a/src/libstd/sys/windows/args.rs +++ b/src/libstd/sys/windows/args.rs @@ -64,7 +64,7 @@ unsafe fn parse_lp_cmd_line<F: Fn() -> OsString>(lp_cmd_line: *const u16, exe_na // no matter what. QUOTE => { let args = { - let mut cut = cmd_line[1..].splitn(2, |&c| c == QUOTE); + let mut cut = cmd_line[1..].splitn(2, |&c: &u16| c == QUOTE); if let Some(exe) = cut.next() { ret_val.push(OsString::from_wide(exe)); } @@ -89,7 +89,7 @@ unsafe fn parse_lp_cmd_line<F: Fn() -> OsString>(lp_cmd_line: *const u16, exe_na // no matter what. _ => { let args = { - let mut cut = cmd_line.splitn(2, |&c| c > 0 && c <= SPACE); + let mut cut = cmd_line.splitn(2, |&c: &u16| c > 0 && c <= SPACE); if let Some(exe) = cut.next() { ret_val.push(OsString::from_wide(exe)); } diff --git a/src/libstd/sys/windows/os_str.rs b/src/libstd/sys/windows/os_str.rs index c7a82e092528e..070b97fc0cb90 100644 --- a/src/libstd/sys/windows/os_str.rs +++ b/src/libstd/sys/windows/os_str.rs @@ -3,11 +3,14 @@ use crate::borrow::Cow; use crate::fmt; -use crate::sys_common::wtf8::{Wtf8, Wtf8Buf}; +use crate::sys_common::wtf8::{self, Wtf8, Wtf8Buf}; use crate::mem; use crate::rc::Rc; use crate::sync::Arc; +use crate::ops::{Index, Range, RangeFrom, RangeTo}; use crate::sys_common::{AsInner, IntoInner, FromInner}; +use core::slice::needles::{SliceSearcher, NaiveSearcher}; +use crate::needle::Hay; #[derive(Clone, Hash)] pub struct Buf { @@ -44,6 +47,7 @@ impl fmt::Display for Buf { } } +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Slice { pub inner: Wtf8 } @@ -60,6 +64,30 @@ impl fmt::Display for Slice { } } +impl Index<Range<usize>> for Slice { + type Output = Slice; + + fn 
index(&self, range: Range<usize>) -> &Slice { + unsafe { mem::transmute(&self.inner[range]) } + } +} + +impl Index<RangeFrom<usize>> for Slice { + type Output = Slice; + + fn index(&self, range: RangeFrom<usize>) -> &Slice { + unsafe { mem::transmute(&self.inner[range]) } + } +} + +impl Index<RangeTo<usize>> for Slice { + type Output = Slice; + + fn index(&self, range: RangeTo<usize>) -> &Slice { + unsafe { mem::transmute(&self.inner[range]) } + } +} + impl Buf { pub fn with_capacity(capacity: usize) -> Buf { Buf { @@ -169,4 +197,26 @@ impl Slice { let rc = self.inner.into_rc(); unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Slice) } } + + pub unsafe fn next_index(&self, index: usize) -> usize { + self.inner.next_index(index) + } + + pub unsafe fn prev_index(&self, index: usize) -> usize { + self.inner.prev_index(index) + } + + pub fn into_searcher(&self) -> InnerSearcher<SliceSearcher<'_, u8>> { + wtf8::new_wtf8_searcher(&self.inner) + } + + pub fn into_consumer(&self) -> InnerSearcher<NaiveSearcher<'_, u8>> { + wtf8::new_wtf8_consumer(&self.inner) + } + + pub fn as_bytes_for_searcher(&self) -> &[u8] { + self.inner.as_inner() + } } + +pub use crate::sys_common::wtf8::Wtf8Searcher as InnerSearcher; diff --git a/src/libstd/sys_common/os_str_bytes.rs b/src/libstd/sys_common/os_str_bytes.rs index a4961974d89ab..8782734040842 100644 --- a/src/libstd/sys_common/os_str_bytes.rs +++ b/src/libstd/sys_common/os_str_bytes.rs @@ -6,18 +6,22 @@ use crate::ffi::{OsStr, OsString}; use crate::fmt; use crate::str; use crate::mem; +use crate::ops::{Index, Range, RangeFrom, RangeTo}; use crate::rc::Rc; use crate::sync::Arc; use crate::sys_common::{FromInner, IntoInner, AsInner}; use crate::sys_common::bytestring::debug_fmt_bytestring; use core::str::lossy::Utf8Lossy; +use core::slice::needles::{SliceSearcher, NaiveSearcher}; +use crate::needle::Hay; #[derive(Clone, Hash)] pub(crate) struct Buf { pub inner: Vec<u8> } +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)] pub(crate) 
struct Slice { pub inner: [u8] } @@ -34,6 +38,30 @@ impl fmt::Display for Slice { } } +impl Index<Range<usize>> for Slice { + type Output = Slice; + + fn index(&self, range: Range<usize>) -> &Slice { + Slice::from_u8_slice(&self.inner[range]) + } +} + +impl Index<RangeFrom<usize>> for Slice { + type Output = Slice; + + fn index(&self, range: RangeFrom<usize>) -> &Slice { + Slice::from_u8_slice(&self.inner[range]) + } +} + +impl Index<RangeTo<usize>> for Slice { + type Output = Slice; + + fn index(&self, range: RangeTo<usize>) -> &Slice { + Slice::from_u8_slice(&self.inner[range]) + } +} + impl fmt::Debug for Buf { fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Debug::fmt(self.as_slice(), formatter) @@ -178,8 +206,31 @@ impl Slice { let rc: Rc<[u8]> = Rc::from(&self.inner); unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Slice) } } + + pub unsafe fn next_index(&self, index: usize) -> usize { + self.inner.next_index(index) + } + + pub unsafe fn prev_index(&self, index: usize) -> usize { + self.inner.prev_index(index) + } + + pub fn into_searcher(&self) -> SliceSearcher<'_, u8> { + SliceSearcher::new(&self.inner) + } + + pub fn into_consumer(&self) -> NaiveSearcher<'_, u8> { + NaiveSearcher::new(&self.inner) + } + + pub fn as_bytes_for_searcher(&self) -> &[u8] { + &self.inner + } } +#[unstable(feature = "needle", issue = "56345")] +pub type InnerSearcher<S> = S; + /// Platform-specific extensions to [`OsString`]. /// /// [`OsString`]: ../../../../std/ffi/struct.OsString.html diff --git a/src/libstd/sys_common/wtf8.rs b/src/libstd/sys_common/wtf8.rs index f17020de44ec5..72beefc7d1064 100644 --- a/src/libstd/sys_common/wtf8.rs +++ b/src/libstd/sys_common/wtf8.rs @@ -1,4 +1,5 @@ -//! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). +//! Implementation of [the WTF-8](https://simonsapin.github.io/wtf-8/) and +//! [OMG-WTF-8](https://github.com/kennytm/omgwtf8) encodings. //! //! 
This library uses Rust’s type system to maintain //! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed), @@ -15,94 +16,180 @@ // unix (it's mostly used on windows), so don't worry about dead code here. #![allow(dead_code)] -use core::str::next_code_point; - use crate::borrow::Cow; -use crate::char; +use crate::cmp; use crate::fmt; use crate::hash::{Hash, Hasher}; -use crate::iter::FromIterator; +use crate::marker::PhantomData; use crate::mem; -use crate::ops; +use crate::num::NonZeroU16; +use crate::ops::{self, Range}; use crate::rc::Rc; use crate::slice; use crate::str; use crate::sync::Arc; use crate::sys_common::AsInner; +use crate::needle::{Hay, Span, Searcher, ReverseSearcher, Consumer, ReverseConsumer}; +use core::slice::needles::{NaiveSearcher, SliceSearcher}; const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}"; -/// A Unicode code point: from U+0000 to U+10FFFF. +/// Represents a high surrogate code point. /// -/// Compares with the `char` type, -/// which represents a Unicode scalar value: -/// a code point that is not a surrogate (U+D800 to U+DFFF). -#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)] -pub struct CodePoint { - value: u32 +/// Internally, the value is the last 2 bytes of the surrogate in its canonical +/// (WTF-8) representation, e.g. U+D800 is `ed a0 80` in WTF-8, so the value +/// stored here would be `0xa080`. This also means the valid range of this type +/// must be `0xa080..=0xafbf`. +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub(super) struct HighSurrogate(NonZeroU16); +impl HighSurrogate { + fn decode(self) -> [u8; 3] { + let c = self.0.get(); + [0xed, (c >> 8) as u8, c as u8] + } + + pub(super) fn value(self) -> u16 { + self.0.get() + } } -/// Format the code point as `U+` followed by four to six hexadecimal digits. 
-/// Example: `U+1F4A9` -impl fmt::Debug for CodePoint { - #[inline] - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(formatter, "U+{:04X}", self.value) +/// Represents a low surrogate code point. +/// +/// Internally, the value is the last 2 bytes of the surrogate in its canonical +/// (WTF-8) representation, e.g. U+DC00 is `ed b0 80` in WTF-8, so the value +/// stored here would be `0xb080`. This also means the valid range of this type +/// must be `0xb080..=0xbfbf`. +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub(super) struct LowSurrogate(NonZeroU16); +impl LowSurrogate { + fn decode(self) -> [u8; 3] { + let c = self.0.get(); + [0xed, (c >> 8) as u8, c as u8] + } + + pub(super) fn value(self) -> u16 { + self.0.get() } } -impl CodePoint { - /// Unsafely creates a new `CodePoint` without checking the value. - /// - /// Only use when `value` is known to be less than or equal to 0x10FFFF. - #[inline] - pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint { - CodePoint { value } +fn decode_surrogate_pair(high: HighSurrogate, low: LowSurrogate) -> [u8; 4] { + // we want to transform the bits from: + // + // high surrogate' low surrogate + // 101wvuts 10rqpnmk 1011jihg 10fedcba + // to + // UTF-8 + // 11110wvu 10tsrqpn 10mkjihg 10fedcba + // ... + + // lo & 0xfff = 00000000 00000000 0000jihg 10fedcba + // + // hi << 12 = 0000101w vuts10rq pnmk0000 00000000 + // ... & 0x303000 = 00000000 00ts0000 00mk0000 00000000 + // + // hi << 14 = 00101wvu ts10rqpn mk000000 00000000 + // ... & 0x70f0000 = 00000wvu 0000rqpn 00000000 00000000 + // + // 0xf0808000 = 11110000 10000000 10000000 00000000 + // + // ... | ... 
= 11110wvu 10tsrqpn 10mkjihg 10fedcba + let lo = low.0.get() as u32; + let hi = (high.0.get() as u32) + 0x100; + let combined = (lo & 0xfff) | (hi << 12 & 0x303000) | (hi << 14 & 0x70f0000) | 0xf0808000; + combined.to_be_bytes() +} + +#[test] +fn test_decode_surrogate_pair() { + fn consume(hi: u16, lo: u16, utf8: [u8; 4]) { + let high = HighSurrogate(NonZeroU16::new(hi).unwrap()); + let low = LowSurrogate(NonZeroU16::new(lo).unwrap()); + assert_eq!(decode_surrogate_pair(high, low), utf8); } + consume(0xa080, 0xb080, [0xf0, 0x90, 0x80, 0x80]); + consume(0xa0bd, 0xb88d, [0xf0, 0x9f, 0x98, 0x8d]); + consume(0xafbf, 0xbfbf, [0xf4, 0x8f, 0xbf, 0xbf]); +} + - /// Creates a new `CodePoint` if the value is a valid code point. +/// Represents a 3-byte sequence as part of a well-formed OMG-WTF-8 sequence. +/// +/// Internally, the sequence is encoded as a big-endian integer to simplify +/// computation (not using native endian here since there's no advantage in +/// reading *3* bytes). +#[derive(Copy, Clone)] +pub(super) struct ThreeByteSeq(u32); +impl ThreeByteSeq { + fn to_high_surrogate_from_split_repr_unchecked(self) -> u16 { + // the high surrogate in split representation has bit pattern + // + // self.0 = ******** 11110kji 10hgfedc 10ba**** + // + // thus: + // self.0 >> 4 = 0000**** ****1111 0kji10hg fedc10ba + // 0x303 = 00000000 00000000 00000011 00000011 + // & = 00000000 00000000 000000hg 000000ba + // + // self.0 >> 6 = 000000** ******11 110kji10 hgfedc10 + // 0x3c3c = 00000000 00000000 00111100 00111100 + // & = 00000000 00000000 000kji00 00fedc00 + // + // ... | ... = 00000000 00000000 000kjihg 00fedcba + // + // The -0x100 is to account for the UTF-16 offset. The final + // 0xa080 is to make the final bit patterns compare the same as + // the canonical representation. + // + (((self.0 >> 4 & 0x303 | self.0 >> 6 & 0x3c3c) - 0x100) | 0xa080) as u16 + } + + /// Obtains the high surrogate value from this 3-byte sequence. 
/// - /// Returns `None` if `value` is above 0x10FFFF. - #[inline] - pub fn from_u32(value: u32) -> Option<CodePoint> { - match value { - 0 ..= 0x10FFFF => Some(CodePoint { value }), - _ => None - } + /// If the input is not a high surrogate, returns None. + fn to_high_surrogate(self) -> Option<HighSurrogate> { + let surrogate_value = match self.0 { + // canonical representation + 0xeda000..=0xedafff => self.0 as u16, + // split representation + 0xf00000..=0xffffffff => self.to_high_surrogate_from_split_repr_unchecked(), + _ => 0, + }; + NonZeroU16::new(surrogate_value).map(HighSurrogate) } - /// Creates a new `CodePoint` from a `char`. + /// Obtains the low surrogate value from this 3-byte sequence. /// - /// Since all Unicode scalar values are code points, this always succeeds. - #[inline] - pub fn from_char(value: char) -> CodePoint { - CodePoint { value: value as u32 } + /// If the input is not a low surrogate, returns None. + fn to_low_surrogate(self) -> Option<LowSurrogate> { + let surrogate_value = match self.0 { + // canonical representation + 0xedb000..=0xedffff => self.0, + // split representation + 0x800000..=0xbfffff => self.0 | 0xb000, + _ => 0, + }; + NonZeroU16::new(surrogate_value as u16).map(LowSurrogate) } - /// Returns the numeric value of the code point. - #[inline] - pub fn to_u32(&self) -> u32 { - self.value + /// Extracts a WTF-16 code unit from the 3-byte sequence. + fn as_code_unit(self) -> u16 { + (match self.0 { + 0xf00000..=0xffffffff => { + (self.0 >> 4 & 3 | self.0 >> 6 & 0xfc | self.0 >> 8 & 0x700) + 0xd7c0 + } + 0x800000..=0xbfffff => self.0 & 0x3f | self.0 >> 2 & 0x3c0 | 0xdc00, + _ => self.0 & 0x3f | self.0 >> 2 & 0xfc0 | self.0 >> 4 & 0xf000, + }) as u16 } - /// Optionally returns a Unicode scalar value for the code point. - /// - /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF). 
- #[inline] - pub fn to_char(&self) -> Option<char> { - match self.value { - 0xD800 ..= 0xDFFF => None, - _ => Some(unsafe { char::from_u32_unchecked(self.value) }) - } + /// Constructs a 3-byte sequence from the bytes. + pub(super) fn new(input: &[u8]) -> Self { + assert!(input.len() >= 3); + ThreeByteSeq((input[0] as u32) << 16 | (input[1] as u32) << 8 | (input[2] as u32)) } - /// Returns a Unicode scalar value for the code point. - /// - /// Returns `'\u{FFFD}'` (the replacement character “�”) - /// if the code point is a surrogate (from U+D800 to U+DFFF). - #[inline] - pub fn to_char_lossy(&self) -> char { - self.to_char().unwrap_or('\u{FFFD}') + pub(super) fn value(self) -> u32 { + self.0 } } @@ -110,7 +197,7 @@ impl CodePoint { /// /// Similar to `String`, but can additionally contain surrogate code points /// if they’re not in a surrogate pair. -#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] +#[derive(Default, Clone)] pub struct Wtf8Buf { bytes: Vec<u8> } @@ -123,12 +210,6 @@ impl ops::Deref for Wtf8Buf { } } -impl ops::DerefMut for Wtf8Buf { - fn deref_mut(&mut self) -> &mut Wtf8 { - self.as_mut_slice() - } -} - /// Format the string with double quotes, /// and surrogates as `\u` followed by four hexadecimal digits. /// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800] @@ -139,17 +220,25 @@ impl fmt::Debug for Wtf8Buf { } } +impl fmt::Display for Wtf8Buf { + #[inline] + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&**self, formatter) + } +} + impl Wtf8Buf { /// Creates a new, empty WTF-8 string. + #[cfg(test)] #[inline] - pub fn new() -> Wtf8Buf { - Wtf8Buf { bytes: Vec::new() } + pub fn new() -> Self { + Self { bytes: Vec::new() } } /// Creates a new, empty WTF-8 string with pre-allocated capacity for `n` bytes. 
#[inline] - pub fn with_capacity(n: usize) -> Wtf8Buf { - Wtf8Buf { bytes: Vec::with_capacity(n) } + pub fn with_capacity(n: usize) -> Self { + Self { bytes: Vec::with_capacity(n) } } /// Creates a WTF-8 string from a UTF-8 `String`. @@ -158,18 +247,20 @@ impl Wtf8Buf { /// /// Since WTF-8 is a superset of UTF-8, this always succeeds. #[inline] - pub fn from_string(string: String) -> Wtf8Buf { - Wtf8Buf { bytes: string.into_bytes() } + pub fn from_string(string: String) -> Self { + Self { bytes: string.into_bytes() } } + /// Creates a WTF-8 string from a UTF-8 `&str` slice. /// /// This copies the content of the slice. /// /// Since WTF-8 is a superset of UTF-8, this always succeeds. + #[cfg(test)] #[inline] - pub fn from_str(str: &str) -> Wtf8Buf { - Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()) } + pub fn from_str(str: &str) -> Self { + Self { bytes: <[_]>::to_vec(str.as_bytes()) } } pub fn clear(&mut self) { @@ -180,35 +271,60 @@ impl Wtf8Buf { /// /// This is lossless: calling `.encode_wide()` on the resulting string /// will always return the original code units. - pub fn from_wide(v: &[u16]) -> Wtf8Buf { - let mut string = Wtf8Buf::with_capacity(v.len()); - for item in char::decode_utf16(v.iter().cloned()) { - match item { - Ok(ch) => string.push_char(ch), - Err(surrogate) => { - let surrogate = surrogate.unpaired_surrogate(); - // Surrogates are known to be in the code point range. 
- let code_point = unsafe { - CodePoint::from_u32_unchecked(surrogate as u32) - }; - // Skip the WTF-8 concatenation check, - // surrogate pairs are already decoded by decode_utf16 - string.push_code_point_unchecked(code_point) + pub fn from_wide(ucs2: &[u16]) -> Self { + fn encode_unit(buf: &mut Vec<u8>, c: u16) { + match c { + 0..=0x7f => { + buf.push(c as u8); + } + 0x80..=0x7ff => { + buf.push((c >> 6 | 0xc0) as u8); + buf.push((c & 0x3f | 0x80) as u8); + } + _ => { + buf.push((c >> 12 | 0xe0) as u8); + buf.push((c >> 6 & 0x3f | 0x80) as u8); + buf.push((c & 0x3f | 0x80) as u8); } } } - string - } - /// Copied from String::push - /// This does **not** include the WTF-8 concatenation check. - fn push_code_point_unchecked(&mut self, code_point: CodePoint) { - let c = unsafe { - char::from_u32_unchecked(code_point.value) - }; - let mut bytes = [0; 4]; - let bytes = c.encode_utf8(&mut bytes).as_bytes(); - self.bytes.extend_from_slice(bytes) + let mut buf = Vec::with_capacity(ucs2.len()); + let mut it = ucs2.iter().fuse().cloned(); + + 'outer: while let Some(mut c1) = it.next() { + if let 0xd800..=0xdbff = c1 { + // we've got a high surrogate. check if it is followed by a + // low surrogate. + while let Some(c2) = it.next() { + match c2 { + 0xd800..=0xdbff => { + // we've got another high surrogate, keep checking + encode_unit(&mut buf, c1); + c1 = c2; + } + 0xdc00..=0xdfff => { + // we've got a low surrogate, write a 4-byte sequence. + let c = ((c1 as u32 & 0x3ff) << 10 | (c2 as u32 & 0x3ff)) + 0x1_0000; + buf.push((c >> 18 | 0xf0) as u8); + buf.push((c >> 12 & 0x3f | 0x80) as u8); + buf.push((c >> 6 & 0x3f | 0x80) as u8); + buf.push((c & 0x3f | 0x80) as u8); + continue 'outer; + } + _ => { + // we've got an unpaired surrogate. 
+ encode_unit(&mut buf, c1); + encode_unit(&mut buf, c2); + continue 'outer; + } + } + } + } + encode_unit(&mut buf, c1); + } + + Self { bytes: buf } } #[inline] @@ -216,11 +332,6 @@ impl Wtf8Buf { unsafe { Wtf8::from_bytes_unchecked(&self.bytes) } } - #[inline] - pub fn as_mut_slice(&mut self) -> &mut Wtf8 { - unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) } - } - /// Reserves capacity for at least `additional` more bytes to be inserted /// in the given `Wtf8Buf`. /// The collection may reserve more space to avoid frequent reallocations. @@ -255,6 +366,7 @@ impl Wtf8Buf { } /// Append a UTF-8 slice at the end of the string. + #[cfg(test)] #[inline] pub fn push_str(&mut self, other: &str) { self.bytes.extend_from_slice(other.as_bytes()) @@ -267,45 +379,22 @@ impl Wtf8Buf { /// like concatenating ill-formed UTF-16 strings effectively would. #[inline] pub fn push_wtf8(&mut self, other: &Wtf8) { - match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) { - // Replace newly paired surrogates by a supplementary code point. - (Some(lead), Some(trail)) => { - let len_without_lead_surrogate = self.len() - 3; - self.bytes.truncate(len_without_lead_surrogate); - let other_without_trail_surrogate = &other.bytes[3..]; - // 4 bytes for the supplementary code point - self.bytes.reserve(4 + other_without_trail_surrogate.len()); - self.push_char(decode_surrogate_pair(lead, trail)); - self.bytes.extend_from_slice(other_without_trail_surrogate); - } - _ => self.bytes.extend_from_slice(&other.bytes) - } - } - - /// Append a Unicode scalar value at the end of the string. - #[inline] - pub fn push_char(&mut self, c: char) { - self.push_code_point_unchecked(CodePoint::from_char(c)) - } + let mut a = &**self; + let mut b = other; - /// Append a code point at the end of the string. - /// - /// This replaces newly paired surrogates at the boundary - /// with a supplementary code point, - /// like concatenating ill-formed UTF-16 strings effectively would. 
- #[inline] - pub fn push(&mut self, code_point: CodePoint) { - if let trail @ 0xDC00..=0xDFFF = code_point.to_u32() { - if let Some(lead) = (&*self).final_lead_surrogate() { - let len_without_lead_surrogate = self.len() - 3; - self.bytes.truncate(len_without_lead_surrogate); - self.push_char(decode_surrogate_pair(lead, trail as u16)); - return + if let Some(hi) = a.split_off_last_high_surrogate() { + if let Some(lo) = b.split_off_first_low_surrogate() { + let len_without_high_surrogate = self.len() - 3; + self.bytes.truncate(len_without_high_surrogate); + // 4 bytes for the supplementary code point + self.bytes.reserve(4 + b.len()); + self.bytes.extend_from_slice(&decode_surrogate_pair(hi, lo)); + self.bytes.extend_from_slice(&b.bytes); + return; } } - // No newly paired surrogates at the boundary. - self.push_code_point_unchecked(code_point) + self.bytes.extend_from_slice(&b.bytes); } /// Shortens a string to the specified length. @@ -314,10 +403,19 @@ impl Wtf8Buf { /// /// Panics if `new_len` > current length, /// or if `new_len` is not a code point boundary. + #[cfg(test)] #[inline] - pub fn truncate(&mut self, new_len: usize) { - assert!(is_code_point_boundary(self, new_len)); - self.bytes.truncate(new_len) + pub fn truncate(&mut self, mut new_len: usize) { + match classify_index(self, new_len) { + IndexType::FourByteSeq2 => new_len += 1, + IndexType::CharBoundary => {} + _ => slice_error_fail(self, 0, new_len), + }; + self.bytes.truncate(new_len); + } + + pub fn make_ascii_uppercase(&mut self) { + self.bytes.make_ascii_uppercase() } /// Consumes the WTF-8 string and tries to convert it to UTF-8. @@ -327,7 +425,8 @@ impl Wtf8Buf { /// If the contents are not well-formed UTF-8 /// (that is, if the string contains surrogates), /// the original WTF-8 string is returned instead. 
- pub fn into_string(self) -> Result<String, Wtf8Buf> { + #[inline] + pub fn into_string(self) -> Result<String, Self> { match self.next_surrogate(0) { None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }), Some(_) => Err(self), @@ -339,6 +438,7 @@ impl Wtf8Buf { /// This does not copy the data (but may overwrite parts of it in place). /// /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”) + #[cfg(test)] pub fn into_string_lossy(mut self) -> String { let mut pos = 0; loop { @@ -354,41 +454,14 @@ impl Wtf8Buf { } /// Converts this `Wtf8Buf` into a boxed `Wtf8`. - #[inline] pub fn into_box(self) -> Box<Wtf8> { - unsafe { mem::transmute(self.bytes.into_boxed_slice()) } + unsafe { Box::from_raw(Box::into_raw(self.bytes.into_boxed_slice()) as *mut Wtf8) } } /// Converts a `Box<Wtf8>` into a `Wtf8Buf`. - pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf { - let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) }; - Wtf8Buf { bytes: bytes.into_vec() } - } -} - -/// Creates a new WTF-8 string from an iterator of code points. -/// -/// This replaces surrogate code point pairs with supplementary code points, -/// like concatenating ill-formed UTF-16 strings effectively would. -impl FromIterator<CodePoint> for Wtf8Buf { - fn from_iter<T: IntoIterator<Item=CodePoint>>(iter: T) -> Wtf8Buf { - let mut string = Wtf8Buf::new(); - string.extend(iter); - string - } -} - -/// Append code points from an iterator to the string. -/// -/// This replaces surrogate code point pairs with supplementary code points, -/// like concatenating ill-formed UTF-16 strings effectively would. 
-impl Extend<CodePoint> for Wtf8Buf { - fn extend<T: IntoIterator<Item=CodePoint>>(&mut self, iter: T) { - let iterator = iter.into_iter(); - let (low, _high) = iterator.size_hint(); - // Lower bound of one byte per code point (ASCII only) - self.bytes.reserve(low); - iterator.for_each(move |code_point| self.push(code_point)); + pub fn from_box(boxed: Box<Wtf8>) -> Self { + let bytes = unsafe { Box::from_raw(Box::into_raw(boxed) as *mut [u8]) }; + Self { bytes: bytes.into_vec() } } } @@ -396,7 +469,6 @@ impl Extend<CodePoint> for Wtf8Buf { /// /// Similar to `&str`, but can additionally contain surrogate code points /// if they’re not in a surrogate pair. -#[derive(Eq, Ord, PartialEq, PartialOrd)] pub struct Wtf8 { bytes: [u8] } @@ -480,16 +552,7 @@ impl Wtf8 { /// Since the byte slice is not checked for valid WTF-8, this functions is /// marked unsafe. #[inline] - unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 { - mem::transmute(value) - } - - /// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice. - /// - /// Since the byte slice is not checked for valid WTF-8, this functions is - /// marked unsafe. - #[inline] - unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 { + pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 { mem::transmute(value) } @@ -510,6 +573,7 @@ impl Wtf8 { /// # Panics /// /// Panics if `position` is beyond the end of the string. + #[cfg(test)] #[inline] pub fn ascii_byte_at(&self, position: usize) -> u8 { match self.bytes[position] { @@ -518,12 +582,6 @@ impl Wtf8 { } } - /// Returns an iterator for the string’s code points. - #[inline] - pub fn code_points(&self) -> Wtf8CodePoints<'_> { - Wtf8CodePoints { bytes: self.bytes.iter() } - } - /// Tries to convert the string to UTF-8 and return a `&str` slice. /// /// Returns `None` if the string contains surrogates. @@ -578,89 +636,143 @@ impl Wtf8 { /// would always return the original WTF-8 string. 
#[inline] pub fn encode_wide(&self) -> EncodeWide<'_> { - EncodeWide { code_points: self.code_points(), extra: 0 } + let ptr = self.bytes.as_ptr(); + let end = unsafe { ptr.add(self.bytes.len()) }; + EncodeWide { ptr, end, _marker: PhantomData } } #[inline] fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> { - let mut iter = self.bytes[pos..].iter(); loop { - let b = *iter.next()?; - if b < 0x80 { - pos += 1; - } else if b < 0xE0 { - iter.next(); - pos += 2; - } else if b == 0xED { - match (iter.next(), iter.next()) { - (Some(&b2), Some(&b3)) if b2 >= 0xA0 => { - return Some((pos, decode_surrogate(b2, b3))) - } - _ => pos += 3 - } - } else if b < 0xF0 { - iter.next(); - iter.next(); - pos += 3; - } else { - iter.next(); - iter.next(); - iter.next(); - pos += 4; - } + let inc = match *self.bytes.get(pos)? { + 0..=0x7f => 1, + 0x80..=0xbf => break, + 0xc0..=0xdf => 2, + b @ 0xe0..=0xef => if b == 0xed && self.bytes[pos + 1] >= 0xa0 { break } else { 3 }, + 0xf0..=0xff => if self.len() == pos + 3 { break } else { 4 }, + }; + pos += inc; } + Some((pos, ThreeByteSeq::new(&self.bytes[pos..]).as_code_unit())) } - #[inline] - fn final_lead_surrogate(&self) -> Option<u16> { - let len = self.len(); - if len < 3 { - return None - } - match &self.bytes[(len - 3)..] { - &[0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)), - _ => None - } + /// Splits-off the first low surrogate from the string. + fn split_off_first_low_surrogate(self: &mut &Self) -> Option<LowSurrogate> { + let input = self.bytes.get(..3)?; + let res = ThreeByteSeq::new(input).to_low_surrogate()?; + *self = unsafe { Self::from_bytes_unchecked(&self.bytes[3..]) }; + Some(res) } - #[inline] - fn initial_trail_surrogate(&self) -> Option<u16> { - let len = self.len(); + /// Splits-off the last high surrogate from the string. 
+ fn split_off_last_high_surrogate(self: &mut &Self) -> Option<HighSurrogate> { + let e = self.len().checked_sub(3)?; + let res = ThreeByteSeq::new(&self.bytes[e..]).to_high_surrogate()?; + *self = unsafe { Self::from_bytes_unchecked(&self.bytes[..e]) }; + Some(res) + } + + /// Split the string into three parts: the beginning low surrogate, the + /// well-formed WTF-8 string in the middle, and the ending high surrogate. + pub(super) fn canonicalize(&self) -> (Option<LowSurrogate>, &[u8], Option<HighSurrogate>) { + let mut s = self; + let low = s.split_off_first_low_surrogate(); + let high = s.split_off_last_high_surrogate(); + (low, &s.bytes, high) + } + + fn canonicalize_in_place(bytes: &mut [u8]) { + let len = bytes.len(); if len < 3 { - return None + return; } - match &self.bytes[..3] { - &[0xED, b2 @ 0xB0..=0xBF, b3] => Some(decode_surrogate(b2, b3)), - _ => None + // first 3 bytes form a low surrogate + // (this check is a faster version of `(0x80..0xc0).contains(_)`). + if (bytes[0] as i8) < -0x40 { + bytes[0] = 0xed; + bytes[1] |= 0x30; + } + // last 3 bytes form a high surrogate + if bytes[len - 3] >= 0xf0 { + let cu = ThreeByteSeq::new(&bytes[(len - 3)..]) + .to_high_surrogate_from_split_repr_unchecked(); + bytes[len - 3] = 0xed; + bytes[len - 2] = (cu >> 8) as u8; + bytes[len - 1] = cu as u8; } - } - - /// Boxes this `Wtf8`. - #[inline] - pub fn into_box(&self) -> Box<Wtf8> { - let boxed: Box<[u8]> = self.bytes.into(); - unsafe { mem::transmute(boxed) } } /// Creates a boxed, empty `Wtf8`. 
+ #[inline] pub fn empty_box() -> Box<Wtf8> { let boxed: Box<[u8]> = Default::default(); - unsafe { mem::transmute(boxed) } + unsafe { Box::from_raw(Box::into_raw(boxed) as *mut Wtf8) } + } + + #[inline] + pub fn into_box(&self) -> Box<Wtf8> { + let mut boxed: Box<[u8]> = Box::from(&self.bytes); + Wtf8::canonicalize_in_place(&mut *boxed); + unsafe { Box::from_raw(Box::into_raw(boxed) as *mut Wtf8) } } #[inline] pub fn into_arc(&self) -> Arc<Wtf8> { let arc: Arc<[u8]> = Arc::from(&self.bytes); - unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) } + let raw = Arc::into_raw(arc); + unsafe { + Wtf8::canonicalize_in_place(&mut *(raw as *mut [u8])); + // safe, we haven't shared the Arc yet. + Arc::from_raw(raw as *mut Wtf8) + } } #[inline] pub fn into_rc(&self) -> Rc<Wtf8> { let rc: Rc<[u8]> = Rc::from(&self.bytes); - unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) } + let raw = Rc::into_raw(rc); + unsafe { + Wtf8::canonicalize_in_place(&mut *(raw as *mut [u8])); + // safe, we haven't shared the Rc yet. + Rc::from_raw(raw as *mut Wtf8) + } } } +// FIXME: Comparing Option<Surrogate> is not fully optimized yet #49892. 
+ +impl PartialEq for Wtf8 { + fn eq(&self, other: &Self) -> bool { + self.canonicalize() == other.canonicalize() + } + fn ne(&self, other: &Self) -> bool { + self.canonicalize() != other.canonicalize() + } +} +impl Eq for Wtf8 {} + +impl PartialOrd for Wtf8 { + fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> { + self.canonicalize().partial_cmp(&other.canonicalize()) + } + fn lt(&self, other: &Self) -> bool { + self.canonicalize() < other.canonicalize() + } + fn le(&self, other: &Self) -> bool { + self.canonicalize() <= other.canonicalize() + } + fn gt(&self, other: &Self) -> bool { + self.canonicalize() > other.canonicalize() + } + fn ge(&self, other: &Self) -> bool { + self.canonicalize() >= other.canonicalize() + } +} +impl Ord for Wtf8 { + fn cmp(&self, other: &Self) -> cmp::Ordering { + self.canonicalize().cmp(&other.canonicalize()) + } +} /// Returns a slice of the given string for the byte range [`begin`..`end`). /// @@ -672,15 +784,21 @@ impl ops::Index<ops::Range<usize>> for Wtf8 { type Output = Wtf8; #[inline] - fn index(&self, range: ops::Range<usize>) -> &Wtf8 { - // is_code_point_boundary checks that the index is in [0, .len()] - if range.start <= range.end && - is_code_point_boundary(self, range.start) && - is_code_point_boundary(self, range.end) { - unsafe { slice_unchecked(self, range.start, range.end) } - } else { - slice_error_fail(self, range.start, range.end) + fn index(&self, mut range: ops::Range<usize>) -> &Wtf8 { + if range.start == range.end { + return Self::from_str(""); } + match classify_index(self, range.start) { + IndexType::FourByteSeq2 => range.start -= 1, + IndexType::CharBoundary => {} + _ => slice_error_fail(self, range.start, range.end), + }; + match classify_index(self, range.end) { + IndexType::FourByteSeq2 => range.end += 1, + IndexType::CharBoundary => {} + _ => slice_error_fail(self, range.start, range.end), + }; + unsafe { slice_unchecked(self, range.start, range.end) } } } @@ -694,13 +812,13 @@ impl 
ops::Index<ops::RangeFrom<usize>> for Wtf8 { type Output = Wtf8; #[inline] - fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 { - // is_code_point_boundary checks that the index is in [0, .len()] - if is_code_point_boundary(self, range.start) { - unsafe { slice_unchecked(self, range.start, self.len()) } - } else { - slice_error_fail(self, range.start, self.len()) - } + fn index(&self, mut range: ops::RangeFrom<usize>) -> &Wtf8 { + match classify_index(self, range.start) { + IndexType::FourByteSeq2 => range.start -= 1, + IndexType::CharBoundary => {} + _ => slice_error_fail(self, range.start, self.len()), + }; + unsafe { slice_unchecked(self, range.start, self.len()) } } } @@ -714,13 +832,13 @@ impl ops::Index<ops::RangeTo<usize>> for Wtf8 { type Output = Wtf8; #[inline] - fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 { - // is_code_point_boundary checks that the index is in [0, .len()] - if is_code_point_boundary(self, range.end) { - unsafe { slice_unchecked(self, 0, range.end) } - } else { - slice_error_fail(self, 0, range.end) - } + fn index(&self, mut range: ops::RangeTo<usize>) -> &Wtf8 { + match classify_index(self, range.end) { + IndexType::FourByteSeq2 => range.end += 1, + IndexType::CharBoundary => {} + _ => slice_error_fail(self, 0, range.end), + }; + unsafe { slice_unchecked(self, 0, range.end) } } } @@ -733,25 +851,52 @@ impl ops::Index<ops::RangeFull> for Wtf8 { } } -#[inline] -fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 { - // The first byte is assumed to be 0xED - 0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F +/// Type of an index in an OMG-WTF-8 string. +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +#[repr(u8)] +enum IndexType { + /// Boundary of a WTF-8 character sequence. + CharBoundary = 0, + /// Byte 1 in a 4-byte sequence. + FourByteSeq1 = 1, + /// Byte 2 in a 4-byte sequence. + FourByteSeq2 = 2, + /// Byte 3 in a 4-byte sequence. 
+ FourByteSeq3 = 3, + /// Pointing inside a 2- or 3-byte sequence. + Interior = 4, + /// Out of bounds. + OutOfBounds = 5, } -#[inline] -fn decode_surrogate_pair(lead: u16, trail: u16) -> char { - let code_point = 0x10000 + ((((lead - 0xD800) as u32) << 10) | (trail - 0xDC00) as u32); - unsafe { char::from_u32_unchecked(code_point) } -} - -/// Copied from core::str::StrPrelude::is_char_boundary -#[inline] -pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { - if index == slice.len() { return true; } - match slice.bytes.get(index) { - None => false, - Some(&b) => b < 128 || b >= 192, +/// Classifies the kind of index in this string. +fn classify_index(slice: &Wtf8, index: usize) -> IndexType { + let slice = &slice.bytes; + let len = slice.len(); + if index == 0 || index == len { + return IndexType::CharBoundary; + } + match slice.get(index) { + Some(0x80..=0xbf) => { + let max_offset = index.min(3); + let min_offset = (index + 3).saturating_sub(len); + for offset in min_offset..max_offset { + let offset = offset + 1; + unsafe { + if slice.get_unchecked(index - offset) >= &0xf0 { + return match offset as u8 { + 1 => IndexType::FourByteSeq1, + 2 => IndexType::FourByteSeq2, + 3 => IndexType::FourByteSeq3, + _ => crate::hint::unreachable_unchecked(), + }; + } + } + } + IndexType::Interior + } + Some(_) => IndexType::CharBoundary, + None => IndexType::OutOfBounds, } } @@ -759,10 +904,8 @@ pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { #[inline] pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { // memory layout of an &[u8] and &Wtf8 are the same - Wtf8::from_bytes_unchecked(slice::from_raw_parts( - s.bytes.as_ptr().add(begin), - end - begin - )) + assert!(begin <= end); + Wtf8::from_bytes_unchecked(s.bytes.get_unchecked(begin..end)) } /// Copied from core::str::raw::slice_error_fail @@ -773,35 +916,18 @@ pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! 
{ begin, end, s); } -/// Iterator for the code points of a WTF-8 string. -/// -/// Created with the method `.code_points()`. -#[derive(Clone)] -pub struct Wtf8CodePoints<'a> { - bytes: slice::Iter<'a, u8> -} - -impl<'a> Iterator for Wtf8CodePoints<'a> { - type Item = CodePoint; - - #[inline] - fn next(&mut self) -> Option<CodePoint> { - next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - let len = self.bytes.len(); - (len.saturating_add(3) / 4, Some(len)) - } -} - /// Generates a wide character sequence for potentially ill-formed UTF-16. #[stable(feature = "rust1", since = "1.0.0")] #[derive(Clone)] pub struct EncodeWide<'a> { - code_points: Wtf8CodePoints<'a>, - extra: u16 + ptr: *const u8, + end: *const u8, + _marker: PhantomData<&'a u8>, +} + +#[inline] +fn code_unit_from_two_byte_seq(c: u8, d: u8) -> u16 { + ((c as u16) & 0x1f) << 6 | ((d as u16) & 0x3f) } // Copied from libunicode/u_str.rs @@ -811,111 +937,122 @@ impl<'a> Iterator for EncodeWide<'a> { #[inline] fn next(&mut self) -> Option<u16> { - if self.extra != 0 { - let tmp = self.extra; - self.extra = 0; - return Some(tmp); + if self.ptr == self.end { + return None; } - let mut buf = [0; 2]; - self.code_points.next().map(|code_point| { - let c = unsafe { - char::from_u32_unchecked(code_point.value) - }; - let n = c.encode_utf16(&mut buf).len(); - if n == 2 { - self.extra = buf[1]; + unsafe { + let c = *self.ptr; + match c { + 0x00..=0x7f => { + self.ptr = self.ptr.offset(1); + Some(c as u16) + } + 0x80..=0xbf | 0xe0..=0xff => { + let tbs = ThreeByteSeq::new(slice::from_raw_parts(self.ptr, 3)); + let mut new_ptr = self.ptr.offset(3); + if c >= 0xf0 && new_ptr != self.end { + new_ptr = self.ptr.offset(1); + } + self.ptr = new_ptr; + Some(tbs.as_code_unit()) + } + 0xc0..=0xdf => { + let d = *self.ptr.offset(1); + self.ptr = self.ptr.offset(2); + Some(code_unit_from_two_byte_seq(c, d)) + } } - buf[0] - }) + } } #[inline] fn 
size_hint(&self) -> (usize, Option<usize>) { - let (low, high) = self.code_points.size_hint(); - // every code point gets either one u16 or two u16, - // so this iterator is between 1 or 2 times as - // long as the underlying iterator. - (low, high.and_then(|n| n.checked_mul(2))) + // converting from WTF-8 to WTF-16: + // 1-byte seq => 1 code unit (1x) + // 2-byte seq => 1 code unit (0.5x) + // 3-byte seq => 1 code unit (0.33x) + // 4-byte seq => 2 code units (0.5x) + // + // thus the lower-limit is everything being a 3-byte seq (= ceil(len/3)) + // and upper-limit is everything being 1-byte seq (= len). + let len = unsafe { self.end.offset_from(self.ptr) as usize }; + (len.saturating_add(2) / 3, Some(len)) } } -impl Hash for CodePoint { +#[stable(feature = "double_ended_encode_wide", since = "1.33.0")] +impl<'a> DoubleEndedIterator for EncodeWide<'a> { #[inline] - fn hash<H: Hasher>(&self, state: &mut H) { - self.value.hash(state) + fn next_back(&mut self) -> Option<u16> { + if self.ptr == self.end { + return None; + } + unsafe { + let last = self.end.offset(-1); + let d = *last; + if d < 0x80 { + self.end = last; + return Some(d as u16); + } + + let last_2 = self.end.offset(-2); + let c = *last_2; + if 0xc0 <= c && c < 0xe0 { + self.end = last_2; + return Some(code_unit_from_two_byte_seq(c, d)); + } + + let mut new_end = self.end.offset(-3); + let tbs = ThreeByteSeq::new(slice::from_raw_parts(new_end, 3)); + if *new_end < 0xc0 && self.ptr != new_end { + new_end = last; + } + self.end = new_end; + Some(tbs.as_code_unit()) + } } } impl Hash for Wtf8Buf { #[inline] fn hash<H: Hasher>(&self, state: &mut H) { - state.write(&self.bytes); - 0xfeu8.hash(state) + (**self).hash(state) } } impl Hash for Wtf8 { #[inline] fn hash<H: Hasher>(&self, state: &mut H) { - state.write(&self.bytes); + let (left, middle, right) = self.canonicalize(); + if let Some(low) = left { + state.write(&low.decode()); + } + state.write(middle); + if let Some(high) = right { + 
state.write(&high.decode()); + } 0xfeu8.hash(state) } } -impl Wtf8 { - pub fn make_ascii_uppercase(&mut self) { self.bytes.make_ascii_uppercase() } +impl PartialEq for Wtf8Buf { + #[inline] + fn eq(&self, other: &Self) -> bool { + **self == **other + } + #[inline] + fn ne(&self, other: &Self) -> bool { + **self != **other + } } +impl Eq for Wtf8Buf {} + #[cfg(test)] mod tests { - use crate::borrow::Cow; use super::*; - #[test] - fn code_point_from_u32() { - assert!(CodePoint::from_u32(0).is_some()); - assert!(CodePoint::from_u32(0xD800).is_some()); - assert!(CodePoint::from_u32(0x10FFFF).is_some()); - assert!(CodePoint::from_u32(0x110000).is_none()); - } - - #[test] - fn code_point_to_u32() { - fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() } - assert_eq!(c(0).to_u32(), 0); - assert_eq!(c(0xD800).to_u32(), 0xD800); - assert_eq!(c(0x10FFFF).to_u32(), 0x10FFFF); - } - - #[test] - fn code_point_from_char() { - assert_eq!(CodePoint::from_char('a').to_u32(), 0x61); - assert_eq!(CodePoint::from_char('💩').to_u32(), 0x1F4A9); - } - - #[test] - fn code_point_to_string() { - assert_eq!(format!("{:?}", CodePoint::from_char('a')), "U+0061"); - assert_eq!(format!("{:?}", CodePoint::from_char('💩')), "U+1F4A9"); - } - - #[test] - fn code_point_to_char() { - fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() } - assert_eq!(c(0x61).to_char(), Some('a')); - assert_eq!(c(0x1F4A9).to_char(), Some('💩')); - assert_eq!(c(0xD800).to_char(), None); - } - - #[test] - fn code_point_to_char_lossy() { - fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() } - assert_eq!(c(0x61).to_char_lossy(), 'a'); - assert_eq!(c(0x1F4A9).to_char_lossy(), '💩'); - assert_eq!(c(0xD800).to_char_lossy(), '\u{FFFD}'); - } - #[test] fn wtf8buf_new() { assert_eq!(Wtf8Buf::new().bytes, b""); @@ -924,23 +1061,25 @@ mod tests { #[test] fn wtf8buf_from_str() { assert_eq!(Wtf8Buf::from_str("").bytes, b""); - assert_eq!(Wtf8Buf::from_str("aé 💩").bytes, - b"a\xC3\xA9 
\xF0\x9F\x92\xA9"); + assert_eq!(Wtf8Buf::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); } #[test] fn wtf8buf_from_string() { assert_eq!(Wtf8Buf::from_string(String::from("")).bytes, b""); - assert_eq!(Wtf8Buf::from_string(String::from("aé 💩")).bytes, - b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert_eq!( + Wtf8Buf::from_string(String::from("aé 💩")).bytes, + b"a\xC3\xA9 \xF0\x9F\x92\xA9", + ); } #[test] fn wtf8buf_from_wide() { assert_eq!(Wtf8Buf::from_wide(&[]).bytes, b""); - assert_eq!(Wtf8Buf::from_wide( - &[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]).bytes, - b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9"); + assert_eq!( + Wtf8Buf::from_wide(&[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]).bytes, + b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9", + ); } #[test] @@ -951,59 +1090,6 @@ mod tests { assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); } - #[test] - fn wtf8buf_push_char() { - let mut string = Wtf8Buf::from_str("aé "); - assert_eq!(string.bytes, b"a\xC3\xA9 "); - string.push_char('💩'); - assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); - } - - #[test] - fn wtf8buf_push() { - let mut string = Wtf8Buf::from_str("aé "); - assert_eq!(string.bytes, b"a\xC3\xA9 "); - string.push(CodePoint::from_char('💩')); - assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); - - fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() } - - let mut string = Wtf8Buf::new(); - string.push(c(0xD83D)); // lead - string.push(c(0xDCA9)); // trail - assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic! 
- - let mut string = Wtf8Buf::new(); - string.push(c(0xD83D)); // lead - string.push(c(0x20)); // not surrogate - string.push(c(0xDCA9)); // trail - assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); - - let mut string = Wtf8Buf::new(); - string.push(c(0xD800)); // lead - string.push(c(0xDBFF)); // lead - assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF"); - - let mut string = Wtf8Buf::new(); - string.push(c(0xD800)); // lead - string.push(c(0xE000)); // not surrogate - assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80"); - - let mut string = Wtf8Buf::new(); - string.push(c(0xD7FF)); // not surrogate - string.push(c(0xDC00)); // trail - assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80"); - - let mut string = Wtf8Buf::new(); - string.push(c(0x61)); // not surrogate, < 3 bytes - string.push(c(0xDC00)); // trail - assert_eq!(string.bytes, b"\x61\xED\xB0\x80"); - - let mut string = Wtf8Buf::new(); - string.push(c(0xDC00)); // trail - assert_eq!(string.bytes, b"\xED\xB0\x80"); - } - #[test] fn wtf8buf_push_wtf8() { let mut string = Wtf8Buf::from_str("aé"); @@ -1074,7 +1160,7 @@ mod tests { fn wtf8buf_into_string() { let mut string = Wtf8Buf::from_str("aé 💩"); assert_eq!(string.clone().into_string(), Ok(String::from("aé 💩"))); - string.push(CodePoint::from_u32(0xD800).unwrap()); + string.push_wtf8(unsafe { Wtf8::from_bytes_unchecked(&[0xed, 0xa0, 0x80]) }); assert_eq!(string.clone().into_string(), Err(string)); } @@ -1082,51 +1168,14 @@ mod tests { fn wtf8buf_into_string_lossy() { let mut string = Wtf8Buf::from_str("aé 💩"); assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩")); - string.push(CodePoint::from_u32(0xD800).unwrap()); + string.push_wtf8(unsafe { Wtf8::from_bytes_unchecked(&[0xed, 0xa0, 0x80]) }); assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩�")); } - #[test] - fn wtf8buf_from_iterator() { - fn f(values: &[u32]) -> Wtf8Buf { - values.iter().map(|&c| CodePoint::from_u32(c).unwrap()).collect::<Wtf8Buf>() - } - 
assert_eq!(f(&[0x61, 0xE9, 0x20, 0x1F4A9]).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); - - assert_eq!(f(&[0xD83D, 0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic! - assert_eq!(f(&[0xD83D, 0x20, 0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); - assert_eq!(f(&[0xD800, 0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF"); - assert_eq!(f(&[0xD800, 0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80"); - assert_eq!(f(&[0xD7FF, 0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80"); - assert_eq!(f(&[0x61, 0xDC00]).bytes, b"\x61\xED\xB0\x80"); - assert_eq!(f(&[0xDC00]).bytes, b"\xED\xB0\x80"); - } - - #[test] - fn wtf8buf_extend() { - fn e(initial: &[u32], extended: &[u32]) -> Wtf8Buf { - fn c(value: &u32) -> CodePoint { CodePoint::from_u32(*value).unwrap() } - let mut string = initial.iter().map(c).collect::<Wtf8Buf>(); - string.extend(extended.iter().map(c)); - string - } - - assert_eq!(e(&[0x61, 0xE9], &[0x20, 0x1F4A9]).bytes, - b"a\xC3\xA9 \xF0\x9F\x92\xA9"); - - assert_eq!(e(&[0xD83D], &[0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic! - assert_eq!(e(&[0xD83D, 0x20], &[0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); - assert_eq!(e(&[0xD800], &[0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF"); - assert_eq!(e(&[0xD800], &[0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80"); - assert_eq!(e(&[0xD7FF], &[0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80"); - assert_eq!(e(&[0x61], &[0xDC00]).bytes, b"\x61\xED\xB0\x80"); - assert_eq!(e(&[], &[0xDC00]).bytes, b"\xED\xB0\x80"); - } - #[test] fn wtf8buf_show() { let mut string = Wtf8Buf::from_str("a\té \u{7f}💩\r"); - string.push(CodePoint::from_u32(0xD800).unwrap()); + string.push_wtf8(unsafe { Wtf8::from_bytes_unchecked(&[0xed, 0xa0, 0x80]) }); assert_eq!(format!("{:?}", string), "\"a\\té \\u{7f}\u{1f4a9}\\r\\u{d800}\""); } @@ -1159,6 +1208,22 @@ mod tests { assert_eq!(&Wtf8::from_str("aé 💩")[1.. 
4].bytes, b"\xC3\xA9 "); } + #[test] + fn omgwtf8_slice() { + let s = Wtf8::from_str("😀😂😄"); + assert_eq!(&s[..].bytes, b"\xf0\x9f\x98\x80\xf0\x9f\x98\x82\xf0\x9f\x98\x84"); + assert_eq!(&s[2..].bytes, b"\x9f\x98\x80\xf0\x9f\x98\x82\xf0\x9f\x98\x84"); + assert_eq!(&s[4..].bytes, b"\xf0\x9f\x98\x82\xf0\x9f\x98\x84"); + assert_eq!(&s[..10].bytes, b"\xf0\x9f\x98\x80\xf0\x9f\x98\x82\xf0\x9f\x98"); + assert_eq!(&s[..8].bytes, b"\xf0\x9f\x98\x80\xf0\x9f\x98\x82"); + assert_eq!(&s[2..10].bytes, b"\x9f\x98\x80\xf0\x9f\x98\x82\xf0\x9f\x98"); + assert_eq!(&s[4..8].bytes, b"\xf0\x9f\x98\x82"); + assert_eq!(&s[2..4].bytes, b"\x9f\x98\x80"); + assert_eq!(&s[2..2].bytes, b""); + assert_eq!(&s[0..2].bytes, b"\xf0\x9f\x98"); + assert_eq!(&s[4..4].bytes, b""); + } + #[test] #[should_panic] fn wtf8_slice_not_code_point_boundary() { @@ -1187,6 +1252,49 @@ mod tests { &Wtf8::from_str("aé 💩")[5..]; } + #[test] + #[should_panic] + fn test_slice_into_invalid_index_split_begin_1() { + let s = unsafe { Wtf8::from_bytes_unchecked(b"\x90\x80\x80\x7e") }; + let _ = s[..1]; + } + #[test] + #[should_panic] + fn test_slice_into_invalid_index_split_begin_2() { + let s = unsafe { Wtf8::from_bytes_unchecked(b"\x90\x80\x80\x7e") }; + let _ = s[..2]; + } + #[test] + #[should_panic] + fn test_slice_into_invalid_index_split_end_1() { + let s = unsafe { Wtf8::from_bytes_unchecked(b"\x7e\xf0\x90\x80") }; + let _ = s[2..]; + } + #[test] + #[should_panic] + fn test_slice_into_invalid_index_split_end_2() { + let s = unsafe { Wtf8::from_bytes_unchecked(b"\x7e\xf0\x90\x80") }; + let _ = s[3..]; + } + #[test] + #[should_panic] + fn test_slice_into_invalid_index_canonical_1() { + let s = unsafe { Wtf8::from_bytes_unchecked(b"\xed\xaf\xbf") }; + let _ = s[1..]; + } + #[test] + #[should_panic] + fn test_slice_into_invalid_index_canonical_2() { + let s = unsafe { Wtf8::from_bytes_unchecked(b"\xed\xaf\xbf") }; + let _ = s[2..]; + } + #[test] + #[should_panic] + fn test_slice_into_invalid_index_wrong_order() { + let 
s = Wtf8::from_str("12345"); + let _ = s[3..1]; + } + #[test] fn wtf8_ascii_byte_at() { let slice = Wtf8::from_str("aé 💩"); @@ -1197,35 +1305,27 @@ mod tests { assert_eq!(slice.ascii_byte_at(4), b'\xFF'); } - #[test] - fn wtf8_code_points() { - fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() } - fn cp(string: &Wtf8Buf) -> Vec<Option<char>> { - string.code_points().map(|c| c.to_char()).collect::<Vec<_>>() + macro_rules! check_encode_wide { + ($s:expr, $cu:expr) => { + let mut v = $cu; + assert_eq!($s.encode_wide().collect::<Vec<_>>(), v); + v.reverse(); + assert_eq!($s.encode_wide().rev().collect::<Vec<_>>(), v); } - let mut string = Wtf8Buf::from_str("é "); - assert_eq!(cp(&string), [Some('é'), Some(' ')]); - string.push(c(0xD83D)); - assert_eq!(cp(&string), [Some('é'), Some(' '), None]); - string.push(c(0xDCA9)); - assert_eq!(cp(&string), [Some('é'), Some(' '), Some('💩')]); } #[test] fn wtf8_as_str() { assert_eq!(Wtf8::from_str("").as_str(), Some("")); assert_eq!(Wtf8::from_str("aé 💩").as_str(), Some("aé 💩")); - let mut string = Wtf8Buf::new(); - string.push(CodePoint::from_u32(0xD800).unwrap()); - assert_eq!(string.as_str(), None); + assert_eq!(unsafe { Wtf8::from_bytes_unchecked(b"\xed\xa0\x80") }.as_str(), None); } #[test] fn wtf8_to_string_lossy() { assert_eq!(Wtf8::from_str("").to_string_lossy(), Cow::Borrowed("")); assert_eq!(Wtf8::from_str("aé 💩").to_string_lossy(), Cow::Borrowed("aé 💩")); - let mut string = Wtf8Buf::from_str("aé 💩"); - string.push(CodePoint::from_u32(0xD800).unwrap()); + let string = &Wtf8::from_str("aé 💩💩")[..10]; let expected: Cow<'_, str> = Cow::Owned(String::from("aé 💩�")); assert_eq!(string.to_string_lossy(), expected); } @@ -1238,18 +1338,442 @@ mod tests { assert_eq!("", d("".as_bytes())); assert_eq!("aé 💩", d("aé 💩".as_bytes())); - - let mut string = Wtf8Buf::from_str("aé 💩"); - string.push(CodePoint::from_u32(0xD800).unwrap()); - assert_eq!("aé 💩�", d(string.as_inner())); + assert_eq!("aé 💩�", d(b"a\xc3\xa9 
\xf0\x9f\x92\xa9\xf0\x9f\x92")); } #[test] fn wtf8_encode_wide() { - let mut string = Wtf8Buf::from_str("aé "); - string.push(CodePoint::from_u32(0xD83D).unwrap()); - string.push_char('💩'); - assert_eq!(string.encode_wide().collect::<Vec<_>>(), - vec![0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]); + let string = unsafe { + Wtf8::from_bytes_unchecked(b"a\xc3\xa9 \xed\xa0\xbd\xf0\x9f\x92\xa9") + }; + check_encode_wide!(string, vec![0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]); + } + + #[test] + fn omgwtf8_encode_wide() { + let s = Wtf8::from_str("😀😂😄"); + check_encode_wide!(s, vec![0xd83d, 0xde00, 0xd83d, 0xde02, 0xd83d, 0xde04]); + check_encode_wide!(s[2..], vec![0xde00, 0xd83d, 0xde02, 0xd83d, 0xde04]); + check_encode_wide!(s[..10], vec![0xd83d, 0xde00, 0xd83d, 0xde02, 0xd83d]); + } + + #[test] + fn omgwtf8_eq_hash() { + use crate::collections::hash_map::DefaultHasher; + + let a = unsafe { Wtf8::from_bytes_unchecked(b"\x90\x8b\xae~\xf0\x90\x80") }; + let b = unsafe { Wtf8::from_bytes_unchecked(b"\xed\xbb\xae~\xf0\x90\x80") }; + let c = unsafe { Wtf8::from_bytes_unchecked(b"\x90\x8b\xae~\xed\xa0\x80") }; + let d = unsafe { Wtf8::from_bytes_unchecked(b"\xed\xbb\xae~\xed\xa0\x80") }; + + assert_eq!(a, b); + assert_eq!(b, c); + assert_eq!(c, d); + + fn hash<H: Hash>(a: H) -> u64 { + let mut h = DefaultHasher::new(); + a.hash(&mut h); + h.finish() + } + + assert_eq!(hash(a), hash(b)); + assert_eq!(hash(b), hash(c)); + assert_eq!(hash(c), hash(d)); + } + + #[test] + fn omgwtf8_classify_index() { + use super::IndexType::*; + + fn consume(input: &Wtf8, expected: &[IndexType]) { + let actual = (0..expected.len()).map(|i| classify_index(input, i)).collect::<Vec<_>>(); + assert_eq!(&*actual, expected); + } + consume( + Wtf8::from_str(""), + &[CharBoundary, OutOfBounds, OutOfBounds], + ); + consume( + Wtf8::from_str("aa"), + &[CharBoundary, CharBoundary, CharBoundary, OutOfBounds], + ); + consume( + Wtf8::from_str("á"), + &[CharBoundary, Interior, CharBoundary, OutOfBounds], + ); + 
consume( + Wtf8::from_str("\u{3000}"), + &[CharBoundary, Interior, Interior, CharBoundary, OutOfBounds], + ); + consume( + Wtf8::from_str("\u{30000}"), + &[CharBoundary, FourByteSeq1, FourByteSeq2, FourByteSeq3, CharBoundary, OutOfBounds], + ); + consume( + unsafe { Wtf8::from_bytes_unchecked(b"\xed\xbf\xbf\xed\xa0\x80") }, + &[ + CharBoundary, Interior, Interior, + CharBoundary, Interior, Interior, + CharBoundary, OutOfBounds, + ], + ); + consume( + unsafe { Wtf8::from_bytes_unchecked(b"\x90\x80\x80\xf0\x90\x80\x80\xf0\x90\x80") }, + &[ + CharBoundary, Interior, Interior, + CharBoundary, FourByteSeq1, FourByteSeq2, FourByteSeq3, + CharBoundary, Interior, Interior, + CharBoundary, OutOfBounds, + ], + ); + } +} + +unsafe impl Hay for Wtf8 { + type Index = usize; + + #[inline] + fn empty<'a>() -> &'a Self { + Wtf8::from_str("") + } + + #[inline] + fn start_index(&self) -> usize { + 0 + } + + #[inline] + fn end_index(&self) -> usize { + self.len() + } + + #[inline] + unsafe fn slice_unchecked(&self, range: Range<usize>) -> &Self { + &self[range] + } + + #[inline] + unsafe fn next_index(&self, index: usize) -> usize { + let offset = match *self.as_inner().get_unchecked(index) { + 0x00..=0x7f => 1, + 0x80..=0xbf => if index == 0 { 3 } else { 2 }, + 0xc0..=0xdf => 2, + 0xe0..=0xef => 3, + 0xf0..=0xff => if index + 3 == self.len() { 3 } else { 2 }, + }; + index + offset + } + + #[inline] + unsafe fn prev_index(&self, index: usize) -> usize { + let bytes = self.as_inner(); + let mut e = index - 1; + + let mut c = *bytes.get_unchecked(e); + if c < 0x80 { + return e; + } + e -= 1; + c = *bytes.get_unchecked(e); + if c >= 0xc0 { + return e; + } + e -= 1; + c = *bytes.get_unchecked(e); + if c < 0xc0 && e != 0 { + e += 1; + } + e + } +} + +#[test] +fn test_wtf8_next_last_index() { + let string = unsafe { Wtf8::from_bytes_unchecked(b"a\xc3\xa9 \xed\xa0\xbd\xf0\x9f\x92\xa9") }; + unsafe { + for w in [0, 1, 3, 4, 7, 9, 11].windows(2) { + let i = w[0]; + let j = w[1]; + 
assert_eq!(string.next_index(i), j); + assert_eq!(string.prev_index(j), i); + } + } +} + +#[derive(Debug)] +enum SurrogateType { + Split, + Canonical, + Empty, +} + +fn extend_subrange( + range: Range<usize>, + mut subrange: Range<usize>, + low_type: SurrogateType, + high_type: SurrogateType, +) -> Range<usize> { + subrange.start -= match low_type { + SurrogateType::Empty => 0, + SurrogateType::Split if subrange.start != range.start + 3 => 2, + _ => 3, + }; + subrange.end += match high_type { + SurrogateType::Empty => 0, + SurrogateType::Split if subrange.end + 3 != range.end => 2, + _ => 3, + }; + subrange +} + +#[derive(Debug, Clone)] +pub struct LowSurrogateSearcher { + canonical: u16, +} + +impl LowSurrogateSearcher { + #[inline] + fn new(ls: LowSurrogate) -> Self { + Self { + canonical: ls.value() & 0xcfff, + } + } + + #[inline] + fn is_match(&self, tbs: ThreeByteSeq) -> Option<SurrogateType> { + let tbs = tbs.value(); + if (tbs & 0xcfff) as u16 != self.canonical { + return None; + } + match tbs >> 12 { + 0xedb => Some(SurrogateType::Canonical), + 0x800..=0xbff => Some(SurrogateType::Split), + _ => None, + } + } +} + +#[derive(Debug, Clone)] +pub struct HighSurrogateSearcher { + canonical: u32, + split: u32, +} + +impl HighSurrogateSearcher { + #[inline] + fn new(hs: HighSurrogate) -> Self { + // the canonical representation + // + // c = 1010 jihg 10fe dcba + // + // rearrange + // + // c & 0xf03 = 0000 jihg 0000 00ba + // c & 0x3c = 0000 0000 00fe dc00 + // ...|...<<2 = 0000 jihg fedc 00ba + // ...+0x100 = 000K JIHG fedc 00ba + // + // rearrange again + // + // s & 0x3ff = 0000 00HG fedc 00ba + // s & 0xfc00 = 000K JI00 0000 0000 + // ...|...<<2 = 0KJI 00HG fedc 00ba + // ...|0x808 = 0KJI 10HG fedc 10ba + // + // this will be the split representation shifted right by 4 bits.
+ + let c = hs.value(); + let s = ((c & 0xf03) | (c & 0x3c) << 2) + 0x100; + let s = (s & 0x3ff) | (s & 0xfc00) << 2 | 0x808; + Self { + canonical: c as u32 | 0xed0000, + split: s as u32 | 0xf0000, + } + } + + #[inline] + fn is_match(&self, tbs: ThreeByteSeq) -> Option<SurrogateType> { + let tbs = tbs.value(); + if tbs == self.canonical { + Some(SurrogateType::Canonical) + } else if tbs >> 4 == self.split { + Some(SurrogateType::Split) + } else { + None + } + } +} + +#[unstable(feature = "needle", issue = "56345")] +#[derive(Debug, Clone)] +pub struct Wtf8Searcher<S> { + low: Option<LowSurrogateSearcher>, + middle: S, + high: Option<HighSurrogateSearcher>, +} + +pub fn new_wtf8_searcher(needle: &Wtf8) -> Wtf8Searcher<SliceSearcher<'_, u8>> { + let (low, middle, high) = needle.canonicalize(); + Wtf8Searcher { + low: low.map(LowSurrogateSearcher::new), + middle: SliceSearcher::new(middle), + high: high.map(HighSurrogateSearcher::new), + } +} + +pub fn new_wtf8_consumer(needle: &Wtf8) -> Wtf8Searcher<NaiveSearcher<'_, u8>> { + let (low, middle, high) = needle.canonicalize(); + Wtf8Searcher { + low: low.map(LowSurrogateSearcher::new), + middle: NaiveSearcher::new(middle), + high: high.map(HighSurrogateSearcher::new), + } +} + +fn compare_boundary_surrogates( + low: &Option<LowSurrogateSearcher>, + high: &Option<HighSurrogateSearcher>, + bytes: &[u8], + range: Range<usize>, + subrange: Range<usize>, +) -> Option<(SurrogateType, SurrogateType)> { + let low_type = if let Some(low) = low { + if subrange.start - range.start < 3 { + return None; + } + let tbs = unsafe { bytes.get_unchecked((subrange.start - 3)..subrange.start) }; + low.is_match(ThreeByteSeq::new(tbs))? + } else { + SurrogateType::Empty + }; + + let high_type = if let Some(high) = high { + if range.end - subrange.end < 3 { + return None; + } + let tbs = unsafe { bytes.get_unchecked(subrange.end..(subrange.end + 3)) }; + high.is_match(ThreeByteSeq::new(tbs))? 
+ } else { + SurrogateType::Empty + }; + + Some((low_type, high_type)) +} + +fn span_as_inner(span: Span<&Wtf8>) -> Span<&[u8]> { + let (hay, range) = span.into_parts(); + unsafe { Span::from_parts(hay.as_inner(), range) } +} + +unsafe impl<'p> Searcher<Wtf8> for Wtf8Searcher<SliceSearcher<'p, u8>> { + #[inline] + fn search(&mut self, mut span: Span<&Wtf8>) -> Option<Range<usize>> { + let (hay, range) = span.clone().into_parts(); + while let Some(subrange) = self.middle.search(span_as_inner(span.clone())) { + if let Some((low_type, high_type)) = compare_boundary_surrogates( + &self.low, + &self.high, + hay.as_inner(), + range.clone(), + subrange.clone(), + ) { + return Some(extend_subrange(range, subrange, low_type, high_type)); + } else { + span = unsafe { Span::from_parts(hay, subrange.end..range.end) }; + } + } + None + } +} + +unsafe impl<'p> Consumer<Wtf8> for Wtf8Searcher<NaiveSearcher<'p, u8>> { + #[inline] + fn consume(&mut self, span: Span<&Wtf8>) -> Option<usize> { + let (hay, range) = span.into_parts(); + let bytes = hay[range.clone()].as_inner(); + let low_len = if self.low.is_some() { 3 } else { 0 }; + let high_len = if self.high.is_some() { 3 } else { 0 }; + let middle = self.middle.needle(); + let mut match_len = low_len + middle.len() + high_len; + if bytes.len() < match_len { + return None; + } + let middle_start = low_len; + let middle_end = match_len - high_len; + if &bytes[middle_start..middle_end] != middle { + return None; + } + if let Some(high) = &self.high { + if let SurrogateType::Split = high.is_match(ThreeByteSeq::new(&bytes[middle_end..]))? { + if bytes.len() != match_len { + match_len -= 1; + } + } + } + if let Some(low) = &self.low { + if let SurrogateType::Split = low.is_match(ThreeByteSeq::new(bytes))? 
{ + if range.start != 0 { + match_len -= 1; + } + } + } + Some(range.start + match_len) + } +} + +unsafe impl<'p> ReverseSearcher<Wtf8> for Wtf8Searcher<SliceSearcher<'p, u8>> { + #[inline] + fn rsearch(&mut self, mut span: Span<&Wtf8>) -> Option<Range<usize>> { + let (hay, range) = span.clone().into_parts(); + while let Some(subrange) = self.middle.rsearch(span_as_inner(span.clone())) { + if let Some((low_type, high_type)) = compare_boundary_surrogates( + &self.low, + &self.high, + hay.as_inner(), + range.clone(), + subrange.clone(), + ) { + return Some(extend_subrange(range, subrange, low_type, high_type)); + } else { + span = unsafe { Span::from_parts(hay, range.start..subrange.start) }; + } + } + None + } +} + +unsafe impl<'p> ReverseConsumer<Wtf8> for Wtf8Searcher<NaiveSearcher<'p, u8>> { + #[inline] + fn rconsume(&mut self, span: Span<&Wtf8>) -> Option<usize> { + let (hay, range) = span.into_parts(); + let bytes = hay[range.clone()].as_inner(); + let low_len = if self.low.is_some() { 3 } else { 0 }; + let high_len = if self.high.is_some() { 3 } else { 0 }; + let middle = self.middle.needle(); + let mut match_len = low_len + middle.len() + high_len; + if bytes.len() < match_len { + return None; + } + let middle_end = bytes.len() - high_len; + let middle_start = middle_end - middle.len(); + if &bytes[middle_start..middle_end] != middle { + return None; + } + if let Some(low) = &self.low { + let start_index = bytes.len() - match_len; + if let SurrogateType::Split = low.is_match(ThreeByteSeq::new(&bytes[start_index..]))? { + if start_index != 0 { + match_len -= 1; + } + } + } + if let Some(high) = &self.high { + if let SurrogateType::Split = high.is_match(ThreeByteSeq::new(&bytes[middle_end..]))? 
{ + if bytes.len() != range.end { + match_len -= 1; + } + } + } + Some(range.end - match_len) } } diff --git a/src/test/ui/issues/issue-2149.stderr b/src/test/ui/issues/issue-2149.stderr index 1df32aafa79c8..6097ed6339d63 100644 --- a/src/test/ui/issues/issue-2149.stderr +++ b/src/test/ui/issues/issue-2149.stderr @@ -10,7 +10,7 @@ error[E0599]: no method named `bind` found for type `[&str; 1]` in the current s --> $DIR/issue-2149.rs:13:12 | LL | ["hi"].bind(|x| [x] ); - | ^^^^ + | ^^^^ help: there is a method with a similar name: `find` | = help: items from traits can only be used if the trait is implemented and in scope = note: the following trait defines an item `bind`, perhaps you need to implement it: diff --git a/src/test/ui/issues/issue-48364.rs b/src/test/ui/issues/issue-48364.rs index 14ee75e7c9cb6..c348320fd2897 100644 --- a/src/test/ui/issues/issue-48364.rs +++ b/src/test/ui/issues/issue-48364.rs @@ -1,5 +1,5 @@ fn foo() -> bool { - b"".starts_with(stringify!(foo)) + b"".eq_ignore_ascii_case(stringify!(foo)) //~^ ERROR mismatched types } diff --git a/src/test/ui/issues/issue-48364.stderr b/src/test/ui/issues/issue-48364.stderr index 7a1ba5bb19361..e05f87294e872 100644 --- a/src/test/ui/issues/issue-48364.stderr +++ b/src/test/ui/issues/issue-48364.stderr @@ -1,8 +1,8 @@ error[E0308]: mismatched types - --> $DIR/issue-48364.rs:2:21 + --> $DIR/issue-48364.rs:2:30 | -LL | b"".starts_with(stringify!(foo)) - | ^^^^^^^^^^^^^^^ expected slice, found str +LL | b"".eq_ignore_ascii_case(stringify!(foo)) + | ^^^^^^^^^^^^^^^ expected slice, found str | = note: expected type `&[u8]` found type `&'static str`