Skip to content

Commit 9d389bc

Browse files
committed
that is still fairly ugly
1 parent 8c36539 commit 9d389bc

File tree

2 files changed

+4
-14
lines changed

2 files changed

+4
-14
lines changed

tokenizers/src/tokenizer/mod.rs

+3 -13
Original file line number | Diff line number | Diff line change
@@ -862,19 +862,9 @@ where
862862
"Pre-tok String: {} vs token {} vs pret {:?}",
863863
string.original,
864864
token,
865-
string
866-
.get_splits(OffsetReferential::Normalized, OffsetType::Byte)
867-
.first()
868-
.unwrap()
865+
string.splits.first().unwrap().normalized.normalized.clone()
869866
);
870-
Some(
871-
string
872-
.get_splits(OffsetReferential::Normalized, OffsetType::Byte)
873-
.first()
874-
.unwrap()
875-
.0
876-
.to_string(),
877-
)
867+
Some(string.splits.first().unwrap().normalized.normalized.clone())
878868
} else {
879869
println!("String: {}", token);
880870
Some(token)
@@ -1334,7 +1324,7 @@ mod test {
13341324
let mut tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3-8B", None).unwrap();
13351325
tokenizer.add_tokens(&[AddedToken::from("ĠåĹİ", false)]); // this is the byte-level for 嗎
13361326
let encoded = tokenizer
1337-
.encode("Hey! how is this token: 嗎", false)
1327+
.encode("Hey! how is this token: 嗎 and ĠåĹİ", false)
13381328
.unwrap();
13391329
println!("Encoded tokens: {:?}", encoded.get_ids());
13401330
let decoded = tokenizer.decode(encoded.get_ids(), false);

tokenizers/src/tokenizer/normalizer.rs

+1 -1
Original file line number | Diff line number | Diff line change
@@ -100,7 +100,7 @@ pub struct NormalizedString {
100100
/// The original version of the string, before any modification
101101
original: String,
102102
/// The normalized version of the string, after all modifications
103-
normalized: String,
103+
pub normalized: String,
104104
/// Mapping from normalized string to original one: (start, end) for each
105105
/// byte of the normalized string
106106
alignments: Vec<(usize, usize)>,

0 commit comments

Comments
 (0)