Fix the `Unigram::from` calls to pass an `AddedVocabulary` argument

ArthurZucker · ArthurZucker · commit 81d83361d0bf · 2024-10-05T17:58:22.000+02:00
diff --git a/tokenizers/src/models/unigram/model.rs b/tokenizers/src/models/unigram/model.rs
@@ -548,7 +548,8 @@ mod tests {
             ("abcd".to_string(), 10.0),
         ];
 
-        let model = Unigram::from(sentencepieces, Some(0), false).unwrap();
+        let model =
+            Unigram::from(sentencepieces, Some(0), false, &AddedVocabulary::default()).unwrap();
         let result = model.encode("abcd").unwrap();
         assert_eq!(result, vec!["abcd"]);
     }
@@ -570,7 +571,8 @@ mod tests {
             ("qr".to_string(), -0.5),
         ];
 
-        let mut model = Unigram::from(sentencepieces, Some(0), false).unwrap();
+        let mut model =
+            Unigram::from(sentencepieces, Some(0), false, &AddedVocabulary::default()).unwrap();
 
         for is_optimized in &[true, false] {
             model.set_optimized(*is_optimized);
@@ -617,7 +619,8 @@ mod tests {
             ("<0xC3>".to_string(), -0.01),
             ("<0xA9>".to_string(), -0.03),
         ];
-        let unigram = Unigram::from(sentencepieces, Some(0), true).unwrap();
+        let unigram =
+            Unigram::from(sentencepieces, Some(0), true, &AddedVocabulary::default()).unwrap();
         let tokens: Vec<Token> = unigram.tokenize("é").unwrap();
         assert_eq!(
             tokens,
diff --git a/tokenizers/src/models/unigram/serialization.rs b/tokenizers/src/models/unigram/serialization.rs
@@ -1,3 +1,5 @@
+use crate::AddedVocabulary;
+
 use super::model::Unigram;
 use serde::{
     de::{Error, MapAccess, Visitor},
@@ -69,8 +71,12 @@ impl<'de> Visitor<'de> for UnigramVisitor {
             }
         }
         match (vocab, unk_id, byte_fallback) {
-            (Some(vocab), unk_id, byte_fallback) => Ok(Unigram::from(vocab, unk_id, byte_fallback)
-                .map_err(|err| Error::custom(format!("Unable to load vocab {err:?}")))?),
+            (Some(vocab), unk_id, byte_fallback) => {
+                Ok(
+                    Unigram::from(vocab, unk_id, byte_fallback, &AddedVocabulary::default())
+                        .map_err(|err| Error::custom(format!("Unable to load vocab {err:?}")))?,
+                )
+            }
             (None, _, _) => Err(Error::custom("Missing vocab")),
         }
     }

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+use crate::AddedVocabulary;`
	`2`	`+`
`1`	`3`	`use super::model::Unigram;`
`2`	`4`	`use serde::{`
`3`	`5`	`de::{Error, MapAccess, Visitor},`
`@@ -69,8 +71,12 @@ impl<'de> Visitor<'de> for UnigramVisitor {`
`69`	`71`	`}`
`70`	`72`	`}`
`71`	`73`	`match (vocab, unk_id, byte_fallback) {`
`72`		`- (Some(vocab), unk_id, byte_fallback) => Ok(Unigram::from(vocab, unk_id, byte_fallback)`
`73`		`- .map_err(\|err\| Error::custom(format!("Unable to load vocab {err:?}")))?),`
	`74`	`+ (Some(vocab), unk_id, byte_fallback) => {`
	`75`	`+ Ok(`
	`76`	`+ Unigram::from(vocab, unk_id, byte_fallback, &AddedVocabulary::default())`
	`77`	`+ .map_err(\|err\| Error::custom(format!("Unable to load vocab {err:?}")))?,`
	`78`	`+ )`
	`79`	`+ }`
`74`	`80`	`(None, _, _) => Err(Error::custom("Missing vocab")),`
`75`	`81`	`}`
`76`	`82`	`}`