1
1
extern crate quickcheck;
2
2
extern crate utf16_literal;
3
+ use std:: fs:: File ;
4
+ use std:: io:: prelude:: * ;
3
5
4
6
use quickcheck:: { QuickCheck , Testable } ;
5
- use tokengrams:: SuffixTable ;
7
+ use tokengrams:: { SuffixTable , Sampler , InMemoryIndex , CountableIndex , SamplerBuilder , SampleableIndex } ;
6
8
use utf16_literal:: utf16;
7
9
8
10
fn sais ( text : & str ) -> SuffixTable {
@@ -152,88 +154,101 @@ fn prop_positions() {
152
154
fn sample_unsmoothed_exists ( ) {
153
155
let sa = sais ( "aaa" ) ;
154
156
let a = utf16 ! ( "a" ) ;
155
- let seqs = sa. sample_unsmoothed ( a, 3 , 10 , 20 , None ) . unwrap ( ) ;
157
+ // Create temporary token file containing contents of suffix array [97, 97, 97]
158
+ // let mut file = File::create("tmp.bin")?;
159
+
160
+ let sampler = Sampler :: new ( SampleableIndex :: Countable ( sa) ) ;
161
+ // let sampler = SamplerBuilder::default().index().build().unwrap();
162
+ let seqs = sampler. sample_unsmoothed ( a, 3 , 10 , 20 , None ) . unwrap ( ) ;
156
163
157
164
assert_eq ! ( * seqs[ 0 ] . last( ) . unwrap( ) , a[ 0 ] ) ;
158
165
assert_eq ! ( * seqs[ 19 ] . last( ) . unwrap( ) , a[ 0 ] ) ;
159
166
}
160
167
161
- #[ test]
162
- fn sample_unsmoothed_empty_query_exists ( ) {
163
- let sa = sais ( "aaa" ) ;
164
- let seqs = sa . sample_unsmoothed ( utf16 ! ( "" ) , 3 , 10 , 20 , None ) . unwrap ( ) ;
168
+ // #[test]
169
+ // fn sample_unsmoothed_empty_query_exists() {
170
+ // let sampler = Sampler::new(CountableIndex::suffix_table( "aaa") );
171
+ // let seqs = sampler .sample_unsmoothed(utf16!(""), 3, 10, 20, None).unwrap();
165
172
166
- assert_eq ! ( * seqs[ 0 ] . last( ) . unwrap( ) , utf16!( "a" ) [ 0 ] ) ;
167
- assert_eq ! ( * seqs[ 19 ] . last( ) . unwrap( ) , utf16!( "a" ) [ 0 ] ) ;
168
- }
173
+ // assert_eq!(*seqs[0].last().unwrap(), utf16!("a")[0]);
174
+ // assert_eq!(*seqs[19].last().unwrap(), utf16!("a")[0]);
175
+ // }
169
176
170
- #[ test]
171
- fn sample_smoothed_exists ( ) {
172
- let mut sa = sais ( "aabbccabccba" ) ;
173
- let tokens = & sa. sample_smoothed ( utf16 ! ( "a" ) , 3 , 10 , 1 , None ) . unwrap ( ) [ 0 ] ;
174
-
175
- assert_eq ! ( tokens. len( ) , 11 ) ;
176
- }
177
+ // #[test]
178
+ // fn sample_smoothed_exists() {
179
+ // let tokens = "aabbccabccba".to_string();
180
+ // let mut sampler = Sampler::new(CountableIndex::suffix_table(&tokens));
181
+ // let tokens = &sampler.sample_smoothed(utf16!("a"), 3, 10, 1, None).unwrap()[0];
177
182
178
- #[ test]
179
- fn sample_smoothed_unigrams_exists ( ) {
180
- let mut sa = sais ( "aabbccabccba" ) ;
181
- let tokens = & sa. sample_smoothed ( utf16 ! ( "a" ) , 1 , 10 , 10 , None ) . unwrap ( ) [ 0 ] ;
182
-
183
- assert_eq ! ( tokens. len( ) , 11 ) ;
184
- }
185
-
186
- #[ test]
187
- fn prop_sample ( ) {
188
- fn prop ( s : String ) -> bool {
189
- let s = s. encode_utf16 ( ) . collect :: < Vec < _ > > ( ) ;
190
- if s. len ( ) < 2 {
191
- return true ;
192
- }
193
-
194
- let table = SuffixTable :: new ( s. clone ( ) , false ) ;
183
+ // assert_eq!(tokens.len(), 11);
184
+ // }
195
185
196
- let query = match s. get ( 0 ..1 ) {
197
- Some ( slice) => slice,
198
- None => & [ ] ,
199
- } ;
200
- let got = & table. sample_unsmoothed ( query, 2 , 1 , 1 , None ) . unwrap ( ) [ 0 ] ;
201
- s. contains ( got. first ( ) . unwrap ( ) )
202
- }
186
+ // #[test]
187
+ // fn sample_smoothed_unigrams_exists() {
188
+ // let tokens = "aabbccabccba".to_string();
189
+ // let mut sampler = Sampler::new(CountableIndex::suffix_table(&tokens));
190
+ // let tokens = &sampler.sample_smoothed(utf16!("a"), 1, 10, 10, None).unwrap()[0];
203
191
204
- qc ( prop as fn ( String ) -> bool ) ;
205
- }
192
+ // assert_eq!(tokens.len(), 11 );
193
+ // }
206
194
207
- #[ test]
208
- fn smoothed_probs_exists ( ) {
209
- let mut sa = sais ( "aaaaaaaabc" ) ;
210
- let query = vec ! [ utf16!( "b" ) [ 0 ] ] ;
211
- let vocab = utf16 ! ( "c" ) [ 0 ] + 1 ;
212
- let a = utf16 ! ( "a" ) [ 0 ] as usize ;
213
- let c = utf16 ! ( "c" ) [ 0 ] as usize ;
195
+ // #[test]
196
+ // fn prop_sample() {
197
+ // fn prop(s: String) -> bool {
198
+ // let sampler = Sampler::new(CountableIndex::suffix_table(&s));
214
199
215
- let smoothed_probs = sa. get_smoothed_probs ( & query, Some ( vocab) ) ;
216
- let bigram_counts = sa. count_next ( & query, Some ( vocab) ) ;
217
- let unsmoothed_probs = bigram_counts
218
- . iter ( )
219
- . map ( |& x| x as f64 / bigram_counts. iter ( ) . sum :: < usize > ( ) as f64 )
220
- . collect :: < Vec < f64 > > ( ) ;
200
+ // let s = s.encode_utf16().collect::<Vec<_>>();
201
+ // if s.len() < 2 {
202
+ // return true;
203
+ // }
204
+
205
+ // // let table = SuffixTable::new(s.clone(), false);
206
+ // // let mut sampler = Sampler::new(CountableIndex::suffix_table(s));
207
+
208
+ // let query = match s.get(0..1) {
209
+ // Some(slice) => slice,
210
+ // None => &[],
211
+ // };
212
+ // let got = &sampler.sample_unsmoothed(query, 2, 1, 1, None).unwrap()[0];
213
+ // s.contains(got.first().unwrap())
214
+ // }
215
+
216
+ // qc(prop as fn(String) -> bool);
217
+ // }
218
+
219
+ // #[test]
220
+ // fn smoothed_probs_exists() {
221
+ // let tokens = "aaaaaaaabc".to_string();
222
+ // let mut sampler = Sampler::new(CountableIndex::suffix_table(&tokens));
223
+ // let mut sa = sais(&tokens);
224
+ // let query = vec![utf16!("b")[0]];
225
+ // let vocab = utf16!("c")[0] + 1;
226
+ // let a = utf16!("a")[0] as usize;
227
+ // let c = utf16!("c")[0] as usize;
228
+
229
+ // let smoothed_probs = sampler.get_smoothed_probs(&query, Some(vocab));
230
+ // let bigram_counts = sa.count_next(&query, Some(vocab));
231
+ // let unsmoothed_probs = bigram_counts
232
+ // .iter()
233
+ // .map(|&x| x as f64 / bigram_counts.iter().sum::<usize>() as f64)
234
+ // .collect::<Vec<f64>>();
221
235
222
- // The naive bigram probability for query 'b' is p(c) = 1.0.
223
- assert ! ( unsmoothed_probs[ a] == 0.0 ) ;
224
- assert ! ( unsmoothed_probs[ c] == 1.0 ) ;
236
+ // // The naive bigram probability for query 'b' is p(c) = 1.0.
237
+ // assert!(unsmoothed_probs[a] == 0.0);
238
+ // assert!(unsmoothed_probs[c] == 1.0);
225
239
226
- // The smoothed bigram probabilities interpolate with the lower-order unigram
227
- // probabilities where p(a) is high, lowering p(c)
228
- assert ! ( smoothed_probs[ a] > 0.1 ) ;
229
- assert ! ( smoothed_probs[ c] < 1.0 ) ;
230
- }
231
-
232
- #[ test]
233
- fn smoothed_probs_empty_query_exists ( ) {
234
- let mut sa = sais ( "aaa" ) ;
235
- let probs = sa. get_smoothed_probs ( & [ ] , Some ( utf16 ! ( "a" ) [ 0 ] + 1 ) ) ;
236
- let residual = ( probs. iter ( ) . sum :: < f64 > ( ) - 1.0 ) . abs ( ) ;
237
-
238
- assert ! ( residual < 1e-4 ) ;
239
- }
240
+ // // The smoothed bigram probabilities interpolate with the lower-order unigram
241
+ // // probabilities where p(a) is high, lowering p(c)
242
+ // assert!(smoothed_probs[a] > 0.1);
243
+ // assert!(smoothed_probs[c] < 1.0);
244
+ // }
245
+
246
+ // #[test]
247
+ // fn smoothed_probs_empty_query_exists() {
248
+ // let tokens = "aaa".to_string();
249
+ // let mut sampler = Sampler::new(CountableIndex::suffix_table(&tokens));
250
+ // let probs = sampler.get_smoothed_probs(&[], Some(utf16!("a")[0] + 1));
251
+ // let residual = (probs.iter().sum::<f64>() - 1.0).abs();
252
+
253
+ // assert!(residual < 1e-4);
254
+ // }
0 commit comments