@@ -41,6 +41,14 @@ lazy_static! {
41
41
r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
42
42
)
43
43
. unwrap( ) ;
44
+ static ref RE_VEC : Vec <SysRegex > = {
45
+ let pattern = r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+" ;
46
+ let mut vec = Vec :: with_capacity( MAX_NUM_THREADS ) ;
47
+ for _ in 0 ..MAX_NUM_THREADS {
48
+ vec. push( SysRegex :: new( pattern) . unwrap( ) ) ;
49
+ }
50
+ vec
51
+ } ;
44
52
// Byte -> char lookup table, built once from `bytes_char()`.
static ref BYTES_CHAR : HashMap <u8 , char > = bytes_char( ) ;
45
53
// Char -> byte lookup table: the `bytes_char()` table with every key/value pair swapped.
static ref CHAR_BYTES: HashMap<char, u8> = {
    let mut inverse = HashMap::new();
    for (byte, ch) in bytes_char() {
        inverse.insert(ch, byte);
    }
    inverse
};
@@ -111,12 +119,31 @@ impl ByteLevel {
111
119
}
112
120
}
113
121
122
+ use std:: num:: NonZeroU64 ;
123
+ use std:: thread;
124
+
125
/// Layout stand-in for `std::thread::ThreadId`, intended as a `transmute` target for reading
/// the numeric thread id.
///
/// Assumes `ThreadId` is represented as a single non-zero `u64`; that is a private detail of
/// the standard library and may change between releases.
///
/// `#[repr(transparent)]` guarantees this wrapper has exactly the layout of `NonZeroU64`.
/// Without it the struct uses the default Rust representation, whose layout is unspecified,
/// so a transmute into it would rest on two unverified layouts instead of one.
#[repr(transparent)]
pub struct FakeThreadId(NonZeroU64);
126
+
127
/// Returns a small, stable index identifying the calling thread.
///
/// The first call made on a thread claims the next value of a process-wide counter (starting
/// at 0) and caches it in thread-local storage; every later call on that thread returns the
/// same value. Indices are never reused, so two threads only share one if the counter wraps
/// around `usize::MAX`. Because indices are handed out densely, reducing the result modulo a
/// table size spreads threads evenly over the table.
///
/// This deliberately avoids transmuting `std::thread::ThreadId` into an integer: that relies
/// on the private layout of a standard-library type (the stable accessor is tracked in
/// https://github.com/rust-lang/rust/issues/67939), and subtracting 1 after truncating the id
/// to `usize` could underflow on 32-bit targets.
fn hash_current_thread() -> usize {
    // Process-wide source of fresh indices.
    static NEXT_INDEX: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0);

    thread_local! {
        // Lazily initialised on the first access from each thread. `Relaxed` is sufficient:
        // only the uniqueness of the returned value matters, not ordering with other memory.
        static THREAD_INDEX: usize =
            NEXT_INDEX.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    }

    THREAD_INDEX.with(|index| *index)
}
138
+
139
/// Number of compiled regex instances stored in `RE_VEC`. `pre_tokenize` reduces the calling
/// thread's index modulo this value to choose an instance.
+ const MAX_NUM_THREADS : usize = 128 ;
140
+
114
141
/// As a `PreTokenizer`, `ByteLevel` is in charge of transforming all the unicode characters into
115
142
/// their byte-level counterpart. It also splits the input according to the configured regex.
116
143
// TODO: Give the ability to modify this regex
117
144
impl PreTokenizer for ByteLevel {
118
145
fn pre_tokenize ( & self , pretokenized : & mut PreTokenizedString ) -> Result < ( ) > {
119
- let re_ref: & SysRegex = & RE ;
146
+ let re_ref: & SysRegex = & RE_VEC [ hash_current_thread ( ) % MAX_NUM_THREADS ] ; // TODO use the thread thing here as well!
120
147
pretokenized. split ( |_, mut normalized| {
121
148
if self . add_prefix_space && !normalized. get ( ) . starts_with ( ' ' ) {
122
149
normalized. prepend ( " " ) ;
0 commit comments