Skip to content

Commit cd9d38a

Browse files
committed
This looks better, but damn, how can it be so bloated?
1 parent 12e75a9 commit cd9d38a

File tree

1 file changed

+21
-4
lines changed
  • tokenizers/src/tokenizer

1 file changed

+21
-4
lines changed

tokenizers/src/tokenizer/mod.rs

+21-4
Original file line number | Diff line number | Diff line change
@@ -856,10 +856,25 @@ where
856856
if skip_special_tokens && self.added_vocabulary.is_special_token(&token) {
857857
None
858858
} else if let Some(pre_tok) = &self.pre_tokenizer {
859-
let mut string = PreTokenizedString::from(token);
860-
pre_tok.pre_tokenize(&mut string);
861-
println!("Pre-tok String: {}", string.original);
862-
Some(string.original)
859+
let mut string = PreTokenizedString::from(token.clone());
860+
let _ = pre_tok.pre_tokenize(&mut string);
861+
println!(
862+
"Pre-tok String: {} vs token {} vs pret {:?}",
863+
string.original,
864+
token,
865+
string
866+
.get_splits(OffsetReferential::Normalized, OffsetType::Byte)
867+
.get(0)
868+
.unwrap()
869+
);
870+
Some(
871+
string
872+
.get_splits(OffsetReferential::Normalized, OffsetType::Byte)
873+
.get(0)
874+
.unwrap()
875+
.0
876+
.to_string(),
877+
)
863878
} else {
864879
println!("String: {}", token);
865880
Some(token)
@@ -1334,6 +1349,8 @@ mod test {
13341349

13351350
tokenizer.add_tokens(&[AddedToken::from("Bác", false)]);
13361351
let encoded = tokenizer.encode("Hey Bác how are you?", false).unwrap();
1352+
println!("{:?}", encoded.get_tokens());
1353+
println!("{:?}", encoded.get_ids());
13371354
println!("{}", tokenizer.decode(encoded.get_ids(), false).unwrap());
13381355
}
13391356
}

0 commit comments

Comments
 (0)