Commit 9d389bc 1 parent 8c36539 commit 9d389bc Copy full SHA for 9d389bc
File tree 2 files changed +4
-14
lines changed
2 files changed +4
-14
lines changed Original file line number Diff line number Diff line change @@ -862,19 +862,9 @@ where
862
862
"Pre-tok String: {} vs token {} vs pret {:?}" ,
863
863
string. original,
864
864
token,
865
- string
866
- . get_splits( OffsetReferential :: Normalized , OffsetType :: Byte )
867
- . first( )
868
- . unwrap( )
865
+ string. splits. first( ) . unwrap( ) . normalized. normalized. clone( )
869
866
) ;
870
- Some (
871
- string
872
- . get_splits ( OffsetReferential :: Normalized , OffsetType :: Byte )
873
- . first ( )
874
- . unwrap ( )
875
- . 0
876
- . to_string ( ) ,
877
- )
867
+ Some ( string. splits . first ( ) . unwrap ( ) . normalized . normalized . clone ( ) )
878
868
} else {
879
869
println ! ( "String: {}" , token) ;
880
870
Some ( token)
@@ -1334,7 +1324,7 @@ mod test {
1334
1324
let mut tokenizer = Tokenizer :: from_pretrained ( "meta-llama/Meta-Llama-3-8B" , None ) . unwrap ( ) ;
1335
1325
tokenizer. add_tokens ( & [ AddedToken :: from ( "ĠåĹİ" , false ) ] ) ; // this is the byte-level for 嗎
1336
1326
let encoded = tokenizer
1337
- . encode ( "Hey! how is this token: 嗎" , false )
1327
+ . encode ( "Hey! how is this token: 嗎 and ĠåĹİ " , false )
1338
1328
. unwrap ( ) ;
1339
1329
println ! ( "Encoded tokens: {:?}" , encoded. get_ids( ) ) ;
1340
1330
let decoded = tokenizer. decode ( encoded. get_ids ( ) , false ) ;
Original file line number Diff line number Diff line change @@ -100,7 +100,7 @@ pub struct NormalizedString {
100
100
/// The original version of the string, before any modification
101
101
original : String ,
102
102
/// The normalized version of the string, after all modifications
103
- normalized : String ,
103
+ pub normalized : String ,
104
104
/// Mapping from normalized string to original one: (start, end) for each
105
105
/// byte of the normalized string
106
106
alignments : Vec < ( usize , usize ) > ,
You can’t perform that action at this time.
0 commit comments