Commit cd9d38a 1 parent 12e75a9 commit cd9d38a Copy full SHA for cd9d38a
File tree 1 file changed +21
-4
lines changed
1 file changed +21
-4
lines changed Original file line number Diff line number Diff line change @@ -856,10 +856,25 @@ where
856
856
if skip_special_tokens && self . added_vocabulary . is_special_token ( & token) {
857
857
None
858
858
} else if let Some ( pre_tok) = & self . pre_tokenizer {
859
- let mut string = PreTokenizedString :: from ( token) ;
860
- pre_tok. pre_tokenize ( & mut string) ;
861
- println ! ( "Pre-tok String: {}" , string. original) ;
862
- Some ( string. original )
859
+ let mut string = PreTokenizedString :: from ( token. clone ( ) ) ;
860
+ let _ = pre_tok. pre_tokenize ( & mut string) ;
861
+ println ! (
862
+ "Pre-tok String: {} vs token {} vs pret {:?}" ,
863
+ string. original,
864
+ token,
865
+ string
866
+ . get_splits( OffsetReferential :: Normalized , OffsetType :: Byte )
867
+ . get( 0 )
868
+ . unwrap( )
869
+ ) ;
870
+ Some (
871
+ string
872
+ . get_splits ( OffsetReferential :: Normalized , OffsetType :: Byte )
873
+ . get ( 0 )
874
+ . unwrap ( )
875
+ . 0
876
+ . to_string ( ) ,
877
+ )
863
878
} else {
864
879
println ! ( "String: {}" , token) ;
865
880
Some ( token)
@@ -1334,6 +1349,8 @@ mod test {
1334
1349
1335
1350
tokenizer. add_tokens ( & [ AddedToken :: from ( "Bác" , false ) ] ) ;
1336
1351
let encoded = tokenizer. encode ( "Hey Bác how are you?" , false ) . unwrap ( ) ;
1352
+ println ! ( "{:?}" , encoded. get_tokens( ) ) ;
1353
+ println ! ( "{:?}" , encoded. get_ids( ) ) ;
1337
1354
println ! ( "{}" , tokenizer. decode( encoded. get_ids( ) , false ) . unwrap( ) ) ;
1338
1355
}
1339
1356
}
You can’t perform that action at this time.
0 commit comments