Commit b080b34

removed prints for clarity
1 parent 8e59945 commit b080b34

File tree: 3 files changed, +44 -36 lines

bindings/python/benches/test_tiktoken.py (+4 -3)

@@ -63,9 +63,11 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     out = enc.encode("This is a test")
 
     hf_enc = Tokenizer.from_pretrained(model)
+    hf_enc.pre_tokenizer = None
     out2 = hf_enc.encode("This is a test", add_special_tokens=False).ids
-
-    assert out == out2, "sanity check"
+    print([hf_enc.decode([k]) for k in out2])
+    print([hf_enc.decode([k]) for k in out])
+    assert out == out2, f"sanity check {out} == {out2}, {hf_enc.decode(out)} == {hf_enc.decode(out2)}"
 
     start = time.perf_counter_ns()
     enc.encode_ordinary_batch(documents, num_threads=num_threads)
@@ -74,7 +76,6 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
     print(f"tiktoken \t{readable_size} / s")
 
-    print(hf_enc)
     start = time.perf_counter_ns()
     hf_enc.encode_batch(documents)
     end = time.perf_counter_ns()
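
Note: setting `hf_enc.pre_tokenizer = None` makes the Hugging Face tokenizer hand raw text straight to the BPE model, which is what tiktoken's encoder sees; without it the two libraries encode different pre-split segments and the id-by-id comparison is meaningless. A minimal standalone version of the sanity check, assuming a checkpoint name such as "gpt2" that exists both as a tiktoken encoding and on the Hub (the bench's `model` argument plays that role):

    import tiktoken
    from tokenizers import Tokenizer

    enc = tiktoken.encoding_for_model("gpt2")  # hypothetical stand-in for `model`
    hf_enc = Tokenizer.from_pretrained("gpt2")
    # Disable pre-tokenization so the BPE model sees the same raw text as tiktoken.
    hf_enc.pre_tokenizer = None

    text = "This is a test"
    out = enc.encode(text)
    out2 = hf_enc.encode(text, add_special_tokens=False).ids
    # Decoding id-by-id makes the first mismatching token easy to spot.
    print([hf_enc.decode([k]) for k in out2])
    assert out == out2, f"sanity check {out} == {out2}"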

tokenizers/src/models/backtracking_bpe/model.rs (+40 -32)

@@ -230,16 +230,18 @@ fn is_valid_token_pair(
 ) -> bool {
     // Keep track of the maximum token which can still be chosen across the split point.
     let mut limit = u32::MAX;
+    // println!("checking if {token1}, {token2} is a valid token_pair");
     loop {
         // Check whether BPE would choose a different token pair across the split point.
         // this is super super important
         if let Some(combined) = pair_lookup.get(&(token1, token2)) {
             if *combined < limit {
+                // println!("Done1");
                 return false;
             }
         }
         // Reverse the merge operation from BPE.
-        // println!("{token1}, {token2}");
+
         // println!("{:?}", split_table);
         if token1 > token2 {
             limit = token1;
@@ -248,6 +250,7 @@ fn is_valid_token_pair(
             limit = token2 + 1;
             token2 = unsafe { split_table.get_unchecked(token2 as usize).0 };
             if token2 + 1 == limit {
+                // println!("Done2");
                 return true;
             }
         }
@@ -258,11 +261,13 @@ fn is_valid_token_pair(
             limit = token1;
             token1 = unsafe { split_table.get_unchecked(token1 as usize).1 };
             if token1 == limit {
+                // println!("Done3");
                 return true;
             }
         }
     }
 }
+
 }
 
 fn token_range(token_starts: &[u32], token_id: u32) -> Range<usize> {
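
Note: the function being instrumented here, is_valid_token_pair, is the core of the backtracking strategy. Given pair_lookup (pair of ids -> merged id) and split_table (id -> the pair it was merged from, with single-byte "leaf" tokens mapping to themselves), it decides whether two adjacent tokens could ever sit side by side in a genuine BPE encoding, i.e. whether BPE would leave that boundary alone rather than merge across it. A rough Python rendering of the same loop, for illustration only (the names mirror the Rust ones; this is not the crate's API):

    U32_MAX = 2**32 - 1

    def is_valid_token_pair(pair_lookup, split_table, token1, token2):
        # `limit` bounds which merged tokens are still eligible across the boundary.
        limit = U32_MAX
        while True:
            # If BPE would merge across the boundary with a token id below the
            # current limit, this boundary can never survive a real encoding.
            combined = pair_lookup.get((token1, token2))
            if combined is not None and combined < limit:
                return False
            # Otherwise undo the most recent merge on the side with the larger
            # (i.e. more recently created) token and re-check.
            if token1 > token2:
                limit = token1
                token1 = split_table[token1][1]  # right child of token1
                if token1 == limit:              # leaf: nothing left to undo
                    return True
            else:
                limit = token2 + 1
                token2 = split_table[token2][0]  # left child of token2
                if token2 + 1 == limit:          # leaf
                    return True

Undoing merges only on the younger side keeps the walk proportional to the merge depth of the two tokens, rather than re-encoding the surrounding text from scratch.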
@@ -477,36 +482,36 @@ impl BacktrackingBpe {
         let mut split_table = vec![];
         let mut pair_lookup = FnvHashMap::default();
 
-        // First option, use the input merge table.
-        if let Some(ref merges) = merges {
-            for (index, pair) in merges.into_iter().enumerate() {
-                let token1 = &pair.0.clone();
-                let token2 = &pair.1.clone();
-                // TODO something is weird here
-                if token1.len() ==1{
-                    split_table.push((vocab[token1], vocab[token1]));
-                }
-                if token2.len() == 1 {
-                    split_table.push((vocab[token2], vocab[token2]));
-                }
-                let id1 = vocab[token1];
-                let id2 = vocab[token2];
-                let new_token = format!("{}{}", token1, &token2);
-                let new_id = vocab
-                    .get(&new_token)
-                    .ok_or(Error::MergeTokenOutOfVocabulary(new_token));
-                if let Ok(id) = new_id {
-                    pair_lookup.insert((id1, id2), *id);
-                    split_table.push((id1, id2));
-                    merge_map.insert(Pair::from((id1, id2)), (index as u32, *id));
-                } else {
-                    println!("Token not added?");
-                }
-
-                // TODO wrong
-            }
-            split_table.push((merges.len() as u32, merges.len() as u32));
-        }
+        // // First option, use the input merge table.
+        // if let Some(ref merges) = merges {
+        //     for (index, pair) in merges.into_iter().enumerate() {
+        //         let token1 = &pair.0.clone();
+        //         let token2 = &pair.1.clone();
+        //         // // TODO something is weird here
+        //         if token1.len() ==1{
+        //             split_table.push((vocab[token1], vocab[token1]));
+        //         }
+        //         if token2.len() == 1 {
+        //             split_table.push((vocab[token2], vocab[token2]));
+        //         }
+        //         let id1 = vocab[token1];
+        //         let id2 = vocab[token2];
+        //         let new_token = format!("{}{}", token1, &token2);
+        //         let new_id = vocab
+        //             .get(&new_token)
+        //             .ok_or(Error::MergeTokenOutOfVocabulary(new_token));
+        //         if let Ok(id) = new_id {
+        //             pair_lookup.insert((id1, id2), *id);
+        //             split_table.push((id1, id2));
+        //             merge_map.insert(Pair::from((id1, id2)), (index as u32, *id));
+        //         } else {
+        //             println!("Token not added?");
+        //         }
+
+        //         // // TODO wrong
+        //     }
+        //     split_table.push((merges.len() as u32, merges.len() as u32));
+        // }
         // Second option, reverse engineer the merge/split table from the vocabulary.
         {
             for (id, token) in token_iter(&all_tokens, &token_starts).enumerate() {
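
Note: with the merges-driven path commented out, the constructor relies entirely on the "second option": since every non-leaf BPE token is the concatenation of exactly two earlier tokens, pair_lookup and split_table can be recovered from the vocabulary alone. A simplified sketch of that idea (hypothetical helper, not the crate's code; it takes the first feasible split, whereas a real implementation re-encodes each token's bytes to find the split BPE would actually produce):

    def reverse_engineer_tables(vocab):
        # vocab: token strings ordered so that id == index == merge order.
        index = {tok: i for i, tok in enumerate(vocab)}
        pair_lookup = {}   # (left_id, right_id) -> merged_id
        split_table = []   # id -> (left_id, right_id); leaves map to themselves
        for tid, tok in enumerate(vocab):
            split = None
            for cut in range(1, len(tok)):
                left, right = index.get(tok[:cut]), index.get(tok[cut:])
                if left is not None and right is not None and left < tid and right < tid:
                    split = (left, right)  # simplification: first feasible cut
                    break
            if split is None:
                split = (tid, tid)         # leaf token, e.g. a single byte
            else:
                pair_lookup[split] = tid
            split_table.append(split)
        return pair_lookup, split_table

The leaf convention (a token splitting into itself) is what lets is_valid_token_pair detect that it has run out of merges to undo.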
@@ -684,6 +689,7 @@ impl BacktrackingBpe {
                     last_token.push(new_token);
                     break;
                 }
+                // println!("Finished encoding prefix")
             }
         }
     }
@@ -729,9 +735,9 @@ impl BacktrackingBpe {
         let mut token = backtrack_state.next_token?;
         let last = backtrack_state.tokens.last().copied();
         loop {
+            // println!("in step, token: {last:?}, {token}");
             let token_len = self.token_len(token);
             let end_pos = backtrack_state.pos + token_len;
-            // println!("in step, token: {last:?}, {token}");
             if backtrack_state.bitfield.is_set(end_pos)
                 && last
                     .map(|last_token| self.is_valid_token_pair(last_token, token))
@@ -755,6 +761,8 @@ impl BacktrackingBpe {
                 break;
             }
         }
+        // println!("finished step, token: {last:?}, {token}");
+
         backtrack_state.next_token
     }
 
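
Note: step() is where the backtracking actually happens. The encoder greedily takes the longest matching token; the bitfield records end positions that have been ruled out, and is_valid_token_pair guards each newly proposed boundary. When a candidate fails, the encoder tries the next-shorter prefix match, and if nothing fits it pops the previous token and marks the position as a dead end. A control-flow illustration in Python (not the crate's API; assumes a byte-level vocabulary so a fallback always exists):

    def encode_backtracking(n, longest_match, token_len, is_valid_pair):
        # longest_match(pos, max_len): longest vocab token matching at pos with
        # length < max_len (None = unbounded), or None if there is no such token.
        tokens, pos = [], 0
        dead_ends = set()   # end positions proven unreachable (the bitfield, inverted)
        token = longest_match(0, None)
        while pos < n:
            end = pos + token_len(token)
            last = tokens[-1] if tokens else None
            if end not in dead_ends and (last is None or is_valid_pair(last, token)):
                tokens.append(token)              # accept and advance
                pos = end
                if pos < n:
                    token = longest_match(pos, None)
            else:
                shorter = longest_match(pos, token_len(token))
                while shorter is None:            # nothing shorter fits: backtrack
                    dead_ends.add(pos)
                    prev = tokens.pop()
                    pos -= token_len(prev)
                    shorter = longest_match(pos, token_len(prev))
                token = shorter
        return tokens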

tokenizers/src/models/backtracking_bpe/serialization.rs (-1)

@@ -175,7 +175,6 @@ mod test {
             }
         }
         println!("End of my example");
-        exit(0);
         let vocab: Vocab = [
             ("a".into(), 0),
             ("b".into(), 1),
