Commit e5976a6

Is this what's expected?
1 parent b51819d commit e5976a6

3 files changed: +30 -3 lines

tokenizers/src/pre_tokenizers/byte_level.rs (+28 -1)
@@ -41,6 +41,14 @@ lazy_static! {
         r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
     )
     .unwrap();
+    static ref RE_VEC: Vec<SysRegex> = {
+        let pattern = r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
+        let mut vec = Vec::with_capacity(MAX_NUM_THREADS);
+        for _ in 0..MAX_NUM_THREADS {
+            vec.push(SysRegex::new(pattern).unwrap());
+        }
+        vec
+    };
     static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
     static ref CHAR_BYTES: HashMap<char, u8> =
         bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
@@ -111,12 +119,31 @@ impl ByteLevel {
     }
 }
 
+use std::num::NonZeroU64;
+use std::thread;
+
+pub struct FakeThreadId(NonZeroU64);
+
+fn hash_current_thread() -> usize {
+    // It's easier to use unsafe than to use nightly. Rust has this nice u64 thread id counter
+    // that works great for our use case of avoiding collisions in our array. Unfortunately,
+    // it's private. However, there are only so many ways you can layout a u64, so just transmute
+    // https://github.com/rust-lang/rust/issues/67939
+    const _: [u8; 8] = [0; std::mem::size_of::<thread::ThreadId>()];
+    const _: [u8; 8] = [0; std::mem::size_of::<FakeThreadId>()];
+    let x =
+        unsafe { std::mem::transmute::<thread::ThreadId, FakeThreadId>(thread::current().id()).0 };
+    u64::from(x) as usize - 1
+}
+
+const MAX_NUM_THREADS: usize = 128;
+
 /// As a `PreTokenizer`, `ByteLevel` is in charge of transforming all the unicode characters into
 /// their byte-level counterpart. It also splits the input according to the configured regex.
 // TODO: Give the ability to modify this regex
 impl PreTokenizer for ByteLevel {
     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
-        let re_ref: &SysRegex = &RE;
+        let re_ref: &SysRegex = &RE_VEC[hash_current_thread() % MAX_NUM_THREADS]; // TODO use the thread thing here as well!
         pretokenized.split(|_, mut normalized| {
             if self.add_prefix_space && !normalized.get().starts_with(' ') {
                 normalized.prepend(" ");

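For context on the byte_level.rs change: instead of funneling every pre_tokenize call through the single lazy_static RE, each OS thread now picks its own SysRegex out of RE_VEC, indexed by a hash of its thread id. The likely motivation is avoiding contention on one shared regex when many threads encode in parallel. The following is a self-contained sketch of the slot-selection trick; FakeThreadId and hash_current_thread mirror the commit's code, while the main driver is purely illustrative.

use std::num::NonZeroU64;
use std::thread;

const MAX_NUM_THREADS: usize = 128;

pub struct FakeThreadId(NonZeroU64);

fn hash_current_thread() -> usize {
    // ThreadId wraps a private, monotonically increasing NonZeroU64
    // (see rust-lang/rust#67939). Transmuting it out is the commit's
    // shortcut around the missing stable accessor; the two const items
    // break the build if either type stops being exactly 8 bytes.
    const _: [u8; 8] = [0; std::mem::size_of::<thread::ThreadId>()];
    const _: [u8; 8] = [0; std::mem::size_of::<FakeThreadId>()];
    let x =
        unsafe { std::mem::transmute::<thread::ThreadId, FakeThreadId>(thread::current().id()).0 };
    // Thread ids start at 1, so subtract 1 to get a 0-based index.
    u64::from(x) as usize - 1
}

fn main() {
    // Every spawned thread resolves to its own slot, so indexing into a
    // Vec of per-thread resources (like RE_VEC) needs no locking.
    let handles: Vec<_> = (0..4)
        .map(|_| thread::spawn(|| println!("slot {}", hash_current_thread() % MAX_NUM_THREADS)))
        .collect();
    for h in handles {
        h.join().unwrap();
    }
}

One caveat: once more than MAX_NUM_THREADS distinct thread ids have been handed out, the modulo maps two threads onto the same slot. That stays sound, since each slot is only ever borrowed immutably, but those threads are back to sharing one regex.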
tokenizers/src/tokenizer/added_vocabulary.rs (+1 -1)
@@ -514,7 +514,7 @@ impl AddedVocabulary {
         // 1. We extract all the non-normalized tokens from the non-normalized string
         pretokenized
             .split(|_, sequence| {
-                Ok(self.split_with_indices(
+                Ok(self.fast_split_with_indices(
                     sequence,
                     &self.split_trie_vec[hash_current_thread() % MAX_NUM_THREADS],
                 ))

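This hunk shows the same pattern already in place for tries: split_trie_vec is indexed with hash_current_thread() % MAX_NUM_THREADS, and only the split call is swapped for its fast_ variant. Since the commit hand-rolls the per-thread-slot pool twice, here is a hypothetical generic wrapper capturing it; PerThread is illustrative, not tokenizers API, and it uses stable std hashing of ThreadId instead of the transmute above.

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::sync::Arc;
use std::thread;

const MAX_NUM_THREADS: usize = 128;

// Hypothetical pool holding one clone of some resource T per thread slot.
struct PerThread<T> {
    slots: Vec<T>,
}

impl<T> PerThread<T> {
    fn new(mut make: impl FnMut() -> T) -> Self {
        PerThread {
            slots: (0..MAX_NUM_THREADS).map(|_| make()).collect(),
        }
    }

    // Slot selection via safe, stable hashing of ThreadId. Distinct threads
    // usually land on distinct slots; a collision merely means two threads
    // share one immutably borrowed resource.
    fn get(&self) -> &T {
        let mut h = DefaultHasher::new();
        thread::current().id().hash(&mut h);
        &self.slots[h.finish() as usize % MAX_NUM_THREADS]
    }
}

fn main() {
    let pool = Arc::new(PerThread::new(|| String::from("per-thread resource")));
    let handles: Vec<_> = (0..4)
        .map(|_| {
            let pool = Arc::clone(&pool);
            // Print the address of each thread's slot to show the spread.
            thread::spawn(move || println!("{:p}", pool.get()))
        })
        .collect();
    for h in handles {
        h.join().unwrap();
    }
}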
tokenizers/src/tokenizer/mod.rs (+1 -1)
@@ -894,7 +894,7 @@ where
     ) -> Result<Encoding> {
         let mut pretokenized: PreTokenizedString = pretokenized.into();
         pretokenized.tokenize(|normalized| self.model.tokenize(normalized.get()))?;
-        pretokenized.into_encoding(word_idx, type_id, offsets_type)
+        pretokenized.fast_into_encoding()
     }
 }
