Move in-memory index tests into the private Rust-side in-memory index file

luciaquirke · luciaquirke · commit bc6d03676c4c · 2024-08-11T08:29:48.000Z
diff --git a/src/bindings/in_memory_index.rs b/src/bindings/in_memory_index.rs
@@ -39,22 +39,6 @@ pub trait InMemoryIndexTrait {
     fn estimate_deltas(&mut self, n: usize);
 }
 
-impl InMemoryIndex {
-    pub fn new(tokens: Vec<usize>, vocab: Option<usize>, verbose: bool) -> Self {
-        let vocab = vocab.unwrap_or(u16::MAX as usize + 1);
-
-        let index: Box<dyn InMemoryIndexTrait + Send + Sync> = if vocab <= u16::MAX as usize + 1 {
-            let tokens: Vec<u16> = tokens.iter().map(|&x| x as u16).collect();
-            Box::new(InMemoryIndexRs::<u16>::new(tokens, Some(vocab), verbose))
-        } else {
-            let tokens: Vec<u32> = tokens.iter().map(|&x| x as u32).collect();
-            Box::new(InMemoryIndexRs::<u32>::new(tokens, Some(vocab), verbose))
-        };
-
-        InMemoryIndex { index }
-    }
-}
-
 #[pymethods]
 impl InMemoryIndex {
     #[new]
diff --git a/src/bindings/memmap_index.rs b/src/bindings/memmap_index.rs
@@ -40,7 +40,7 @@ pub trait MemmapIndexTrait {
 impl MemmapIndex {
     #[new]
     #[pyo3(signature = (text_path, table_path, vocab=u16::MAX as usize + 1))]
-    pub fn new_py(
+    pub fn new(
         _py: Python,
         text_path: String,
         table_path: String,
@@ -146,17 +146,3 @@ impl MemmapIndex {
         self.index.estimate_deltas(n);
     }
 }
-
-impl MemmapIndex {
-    pub fn new(text_path: String, table_path: String, vocab: usize) -> Result<Self> {
-        if vocab <= u16::MAX as usize + 1 {
-            Ok(MemmapIndex {
-                index: Box::new(MemmapIndexRs::<u16>::new(text_path, table_path, vocab)?),
-            })
-        } else {
-            Ok(MemmapIndex {
-                index: Box::new(MemmapIndexRs::<u32>::new(text_path, table_path, vocab)?),
-            })
-        }
-    }
-}
diff --git a/src/in_memory_index.rs b/src/in_memory_index.rs
@@ -283,3 +283,113 @@ impl<T: Unsigned> InMemoryIndexTrait for InMemoryIndexRs<T> {
         <Self as Sample<T>>::estimate_deltas(self, n)
     }
 }
+
+
+#[cfg(test)]
+pub mod tests {
+    use super::*;
+    use utf16_literal::utf16;
+    use crate::table::SuffixTable;
+
+    fn sais(text: &str) -> SuffixTable {
+        SuffixTable::new(text.encode_utf16().collect::<Vec<_>>(), None, false)
+    }
+
+    fn utf16_as_usize(s: &str) -> Vec<usize> {
+        s.encode_utf16().map(|x| x as usize).collect()
+    }
+
+    #[test]
+    fn sample_unsmoothed_empty_query_exists() {
+        let s = utf16!("aaa");
+        let index: Box<dyn Sample<u16>> = Box::new(InMemoryIndexRs::new(s.to_vec(), None, false));
+
+        let seqs = index.sample_unsmoothed(&[], 3, 10, 1).unwrap();
+
+        assert_eq!(*seqs[0].last().unwrap(), s[0]);
+    }
+
+    #[test]
+    fn sample_unsmoothed_u16_exists() {
+        let s = utf16!("aaaa");
+        let a = &s[0..1];
+        let index: Box<dyn Sample<u16>> = Box::new(InMemoryIndexRs::new(s.to_vec(), None, false));
+
+        let seqs = index.sample_unsmoothed(a, 3, 10, 1).unwrap();
+
+        assert_eq!(*seqs[0].last().unwrap(), a[0]);
+    }
+
+    #[test]
+    fn sample_unsmoothed_u32_exists() {
+        let s: Vec<u32> = "aaaa".encode_utf16().map(|c| c as u32).collect();
+        let u32_vocab = Some(u16::MAX as usize + 2);
+        let index: Box<dyn Sample<u32>> = Box::new(InMemoryIndexRs::<u32>::new(s.clone(), u32_vocab, false));
+
+        let seqs = index.sample_unsmoothed(&s[0..1], 3, 10, 1).unwrap();
+
+        assert_eq!(*seqs[0].last().unwrap(), s[0]);
+    }
+
+    #[test]
+    fn sample_unsmoothed_usize_exists() {
+        let s = utf16_as_usize("aaaa");
+        let index: Box<dyn InMemoryIndexTrait> = Box::new(InMemoryIndexRs::new(s.to_vec(), None, false));
+
+        let seqs = index.sample_unsmoothed(s[0..1].to_vec(), 3, 10, 1).unwrap();
+
+        assert_eq!(*seqs[0].last().unwrap(), s[0]);
+    }
+
+    #[test]
+    fn sample_smoothed_exists() {
+        let s = utf16!("aabbccabccba");
+        let mut index: Box<dyn Sample<u16>> = Box::new(InMemoryIndexRs::new(s.to_vec(), None, false));
+
+        let tokens = &index.sample_smoothed(&s[0..1], 3, 10, 1).unwrap()[0];
+
+        assert_eq!(tokens.len(), 11);
+    }
+
+    #[test]
+    fn sample_smoothed_empty_query_exists() {
+        let s: Vec<u16> = "aabbccabccba".encode_utf16().collect();
+        let mut index: Box<dyn Sample<u16>> = Box::new(InMemoryIndexRs::new(s, None, false));
+
+        let tokens = &index.sample_smoothed(&[], 1, 10, 10).unwrap()[0];
+
+        assert_eq!(tokens.len(), 10);
+    }
+
+    #[test]
+    fn smoothed_probs_exists() {
+        let tokens = "aaaaaaaabc".to_string();
+        let tokens_vec: Vec<u16> = tokens.encode_utf16().collect();
+        let query: Vec<_> = vec![utf16!("b")[0]];
+
+        // Get unsmoothed probs for query
+        let sa: SuffixTable = sais(&tokens);
+        let bigram_counts = sa.count_next(&query);
+        let unsmoothed_probs = bigram_counts
+            .iter()
+            .map(|&x| x as f64 / bigram_counts.iter().sum::<usize>() as f64)
+            .collect::<Vec<f64>>();
+
+        // Get smoothed probs for query
+        let mut index: Box<dyn Sample<u16>> = Box::new(InMemoryIndexRs::new(tokens_vec, None, false));
+        let smoothed_probs = index.get_smoothed_probs(&query);
+
+        // Compare unsmoothed and smoothed probabilities
+        let a = utf16!("a")[0] as usize;
+        let c = utf16!("c")[0] as usize;
+
+        // The naive bigram probability for query 'b' is p(c) = 1.0.
+        assert!(unsmoothed_probs[a] == 0.0);
+        assert!(unsmoothed_probs[c] == 1.0);
+
+        // The smoothed bigram probabilities interpolate with the lower-order unigram
+        // probabilities where p(a) is high, lowering p(c)
+        assert!(smoothed_probs[a] > 0.1);
+        assert!(smoothed_probs[c] < 1.0);
+    }
+}
diff --git a/tests/tests.rs b/tests/tests.rs
@@ -2,7 +2,7 @@ extern crate quickcheck;
 extern crate utf16_literal;
 
 use quickcheck::{QuickCheck, Testable};
-use tokengrams::{InMemoryIndex, SuffixTable};
+use tokengrams::SuffixTable;
 use utf16_literal::utf16;
 
 fn sais(text: &str) -> SuffixTable {
@@ -13,10 +13,6 @@ fn qc<T: Testable>(f: T) {
     QuickCheck::new().tests(1000).max_tests(10000).quickcheck(f);
 }
 
-fn utf16_as_usize(s: &str) -> Vec<usize> {
-    s.encode_utf16().map(|x| x as usize).collect()
-}
-
 // Do some testing on substring search.
 
 #[test]
@@ -150,111 +146,3 @@ fn prop_positions() {
     }
     qc(prop as fn(String, u16) -> bool);
 }
-
-#[test]
-fn sample_unsmoothed_exists() {
-    let s = utf16_as_usize("aaaa");
-    let a = &s[0..1];
-
-    let index = InMemoryIndex::new(s.clone(), None, false);
-    let seqs = index.sample_unsmoothed(a.to_vec(), 3, 10, 20).unwrap();
-
-    assert_eq!(*seqs[0].last().unwrap(), a[0]);
-    assert_eq!(*seqs[19].last().unwrap(), a[0]);
-}
-
-#[test]
-fn sample_unsmoothed_empty_query_exists() {
-    let s = utf16_as_usize("aaa");
-    let a = s[0];
-    let index = InMemoryIndex::new(s.clone(), None, false);
-    let seqs = index.sample_unsmoothed(Vec::new(), 3, 10, 20).unwrap();
-
-    assert_eq!(*seqs[0].last().unwrap(), a);
-    assert_eq!(*seqs[19].last().unwrap(), a);
-}
-
-#[test]
-fn sample_smoothed_exists() {
-    let s = utf16_as_usize("aabbccabccba");
-    let mut index = InMemoryIndex::new(s.clone(), None, false);
-
-    let tokens = &index.sample_smoothed(s[0..1].to_vec(), 3, 10, 1).unwrap()[0];
-
-    assert_eq!(tokens.len(), 11);
-}
-
-#[test]
-fn sample_smoothed_unigrams_exists() {
-    let s = utf16_as_usize("aabbccabccba");
-    let mut index = InMemoryIndex::new(s.clone(), None, false);
-
-    let tokens = &index.sample_smoothed(s[0..1].to_vec(), 1, 10, 10).unwrap()[0];
-
-    assert_eq!(tokens.len(), 11);
-}
-
-#[test]
-fn prop_sample() {
-    fn prop(s: String) -> bool {
-        let s = utf16_as_usize(&s);
-        if s.len() < 2 {
-            return true;
-        }
-
-        let query = match s.get(0..1) {
-            Some(slice) => slice,
-            None => &[],
-        };
-        let index = InMemoryIndex::new(s.clone(), None, false);
-
-        let got = &index.sample_unsmoothed(query.to_vec(), 2, 1, 1).unwrap()[0];
-        s.contains(got.first().unwrap())
-    }
-
-    qc(prop as fn(String) -> bool);
-}
-
-#[test]
-fn smoothed_probs_exists() {
-    let tokens = "aaaaaaaabc".to_string();
-
-    let sa: SuffixTable = sais(&tokens);
-    let query = vec![utf16!("b")[0]];
-    let vocab = utf16!("c")[0] + 1;
-    let a = utf16!("a")[0] as usize;
-    let c = utf16!("c")[0] as usize;
-
-    let bigram_counts = sa.count_next(&query);
-    let unsmoothed_probs = bigram_counts
-        .iter()
-        .map(|&x| x as f64 / bigram_counts.iter().sum::<usize>() as f64)
-        .collect::<Vec<f64>>();
-
-    let s = utf16_as_usize(&tokens);
-    let query = utf16_as_usize("b");
-    let mut index = InMemoryIndex::new(s.clone(), Some(vocab as usize), false);
-    let smoothed_probs = index.get_smoothed_probs(query);
-
-    // The naive bigram probability for query 'b' is p(c) = 1.0.
-    assert!(unsmoothed_probs[a] == 0.0);
-    assert!(unsmoothed_probs[c] == 1.0);
-
-    // The smoothed bigram probabilities interpolate with the lower-order unigram
-    // probabilities where p(a) is high, lowering p(c)
-    assert!(smoothed_probs[a] > 0.1);
-    assert!(smoothed_probs[c] < 1.0);
-}
-
-#[test]
-fn smoothed_probs_empty_query_exists() {
-    let s = utf16_as_usize("aaa");
-    let vocab = s[0] + 1;
-
-    let mut index = InMemoryIndex::new(s, Some(vocab), false);
-
-    let probs = index.get_smoothed_probs(Vec::new());
-    let residual = (probs.iter().sum::<f64>() - 1.0).abs();
-
-    assert!(residual < 1e-4);
-}