Skip to content

Commit 41ed334

Browse files
committed Jun 22, 2024 ·
Claude SA-IS impl
1 parent d403ef9 commit 41ed334

File tree

2 files changed

+181
-0
lines changed

2 files changed

+181
-0
lines changed
 

‎src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use pyo3::prelude::*;
99
mod in_memory_index;
1010
mod memmap_index;
1111
mod par_quicksort;
12+
mod sa_is;
1213
mod table;
1314
mod util;
1415

‎src/sa_is.rs

+180
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
// SA-IS algorithm for suffix array construction.
//
// The text is treated as if it were followed by a virtual sentinel that is
// smaller than every symbol; the sentinel itself never appears in the output.

/// Marker stored in suffix-array slots that have not been filled yet.
///
/// `0` cannot be used for this purpose because it is a valid suffix index.
const EMPTY: usize = usize::MAX;

/// Builds the suffix array of `s`.
///
/// Returns a vector `sa` of length `s.len()` such that `sa[k]` is the start
/// index of the `k`-th smallest suffix of `s` in lexicographic order (a suffix
/// that is a proper prefix of another sorts first).
///
/// Empty input yields an empty vector and a single symbol yields `[0]`.
fn sa_is(s: &[u16]) -> Vec<usize> {
    // Size the bucket table by the largest symbol actually present instead of
    // always allocating all 65536 slots.
    let alphabet = s.iter().copied().max().map_or(0, |m| usize::from(m) + 1);
    sa_is_impl(s, alphabet)
}

/// Core of the algorithm, generic over the symbol type so that the reduced
/// problem (whose symbols are `usize` names) can reuse the same code.
///
/// Every symbol of `s`, converted to `usize`, must be smaller than `alphabet`.
fn sa_is_impl<T>(s: &[T], alphabet: usize) -> Vec<usize>
where
    T: Copy + Ord + Into<usize>,
{
    let n = s.len();
    if n <= 1 {
        return (0..n).collect();
    }

    let t = classify_types(s);
    // LMS positions in text order.
    let lms: Vec<usize> = (1..n).filter(|&i| is_lms(&t, i)).collect();

    let mut sa = vec![EMPTY; n];
    let mut bkt = vec![0usize; alphabet];

    // Stage 1: induced sort with the LMS suffixes seeded in arbitrary order.
    // This sorts the LMS *substrings*.
    place_lms(&mut sa, s, &mut bkt, &lms);
    induce_l(&mut sa, s, &mut bkt, &t);
    induce_s(&mut sa, s, &mut bkt, &t);

    // With at most one LMS suffix the seeds were trivially in sorted order,
    // so the first pass already produced the final suffix array.
    if lms.len() <= 1 {
        return sa;
    }

    // Stage 2: name the LMS substrings in sorted order; equal substrings
    // share a name.
    let mut names = vec![EMPTY; n];
    let mut distinct = 0usize;
    let mut prev: Option<usize> = None;
    for &p in sa.iter().filter(|&&p| is_lms(&t, p)) {
        if let Some(q) = prev {
            if !lms_substrings_equal(s, &t, q, p) {
                distinct += 1;
            }
        }
        names[p] = distinct;
        prev = Some(p);
    }
    distinct += 1; // convert "largest name" into "number of names"

    let reduced: Vec<usize> = lms.iter().map(|&p| names[p]).collect();
    drop(names);

    let reduced_sa = if distinct == lms.len() {
        // All names are unique: the suffix array of the reduced string is the
        // inverse permutation of the names.
        let mut order = vec![0usize; reduced.len()];
        for (i, &name) in reduced.iter().enumerate() {
            order[name] = i;
        }
        order
    } else {
        // Ties remain: sort the reduced string recursively.
        sa_is_impl(&reduced, distinct)
    };
    let sorted_lms: Vec<usize> = reduced_sa.iter().map(|&r| lms[r]).collect();

    // Stage 3: induced sort again, this time seeded with the LMS suffixes in
    // their true sorted order.
    sa.fill(EMPTY);
    place_lms(&mut sa, s, &mut bkt, &sorted_lms);
    induce_l(&mut sa, s, &mut bkt, &t);
    induce_s(&mut sa, s, &mut bkt, &t);

    sa
}

/// Classifies every position of `s`: `true` is S-type, `false` is L-type.
///
/// The last position is L-type because the virtual sentinel that follows it
/// is smaller than any symbol.
fn classify_types<T: Copy + Ord>(s: &[T]) -> Vec<bool> {
    let n = s.len();
    let mut t = vec![false; n];
    for i in (0..n.saturating_sub(1)).rev() {
        t[i] = s[i] < s[i + 1] || (s[i] == s[i + 1] && t[i + 1]);
    }
    t
}

/// Returns `true` when `i` is a leftmost S-type (LMS) position: S-type and
/// immediately preceded by an L-type position.
fn is_lms(t: &[bool], i: usize) -> bool {
    i > 0 && t[i] && !t[i - 1]
}

/// Overwrites `bkt` with the bucket boundaries of `s`: the exclusive end of
/// each symbol's bucket when `ends` is `true`, otherwise its start.
fn fill_buckets<T: Copy + Into<usize>>(s: &[T], bkt: &mut [usize], ends: bool) {
    bkt.fill(0);
    for &sym in s {
        let c: usize = sym.into();
        bkt[c] += 1;
    }
    let mut sum = 0usize;
    for slot in bkt.iter_mut() {
        let count = *slot;
        sum += count;
        *slot = if ends { sum } else { sum - count };
    }
}

/// Writes the LMS positions in `lms` to the tails of their buckets.
///
/// Iterating in reverse keeps the relative order of `lms` within each bucket.
fn place_lms<T: Copy + Into<usize>>(sa: &mut [usize], s: &[T], bkt: &mut [usize], lms: &[usize]) {
    fill_buckets(s, bkt, true);
    for &p in lms.iter().rev() {
        let c: usize = s[p].into();
        bkt[c] -= 1;
        sa[bkt[c]] = p;
    }
}

/// Compares the LMS substrings starting at the distinct LMS positions `a` and
/// `b` (each runs up to and including the next LMS position, or to the
/// virtual sentinel for the last one).
fn lms_substrings_equal<T: Copy + Ord>(s: &[T], t: &[bool], a: usize, b: usize) -> bool {
    let n = s.len();
    let mut d = 0usize;
    loop {
        let (i, j) = (a + d, b + d);
        // Running off the text means one substring ends at the virtual
        // sentinel, which occurs only once, so the substrings differ.
        if i >= n || j >= n {
            return false;
        }
        if s[i] != s[j] {
            return false;
        }
        if d > 0 {
            let (end_a, end_b) = (is_lms(t, i), is_lms(t, j));
            if end_a || end_b {
                return end_a && end_b;
            }
        }
        d += 1;
    }
}

// Helper function to induce L-type suffixes.
//
// `sa` must hold the seeded LMS positions with every other slot set to
// `EMPTY`. `bkt` is scratch space with one slot per symbol; its previous
// contents are ignored and it is refilled with bucket heads computed from
// `s`. `t` holds the S/L classification, with the last position L-type.
fn induce_l<T: Copy + Into<usize>>(sa: &mut [usize], s: &[T], bkt: &mut [usize], t: &[bool]) {
    let n = s.len();
    if n == 0 {
        return;
    }
    fill_buckets(s, bkt, false);

    // The last suffix is L-type and is the smallest suffix starting with its
    // symbol. It would be induced from the virtual sentinel, so seed it
    // explicitly at the head of its bucket.
    let last: usize = s[n - 1].into();
    sa[bkt[last]] = n - 1;
    bkt[last] += 1;

    for i in 0..n {
        let p = sa[i];
        if p == EMPTY || p == 0 {
            continue;
        }
        let j = p - 1;
        if !t[j] {
            let c: usize = s[j].into();
            sa[bkt[c]] = j;
            bkt[c] += 1;
        }
    }
}

// Helper function to induce S-type suffixes.
//
// Must run after `induce_l`. `bkt` is scratch space with one slot per symbol;
// its previous contents are ignored and it is refilled with bucket tails
// computed from `s`. Scanning right to left, every S-type predecessor of an
// already placed suffix is written to the tail of its bucket, overwriting the
// LMS seeds.
fn induce_s<T: Copy + Into<usize>>(sa: &mut [usize], s: &[T], bkt: &mut [usize], t: &[bool]) {
    let n = s.len();
    fill_buckets(s, bkt, true);
    for i in (0..n).rev() {
        let p = sa[i];
        if p == EMPTY || p == 0 {
            continue;
        }
        let j = p - 1;
        if t[j] {
            let c: usize = s[j].into();
            bkt[c] -= 1;
            sa[bkt[c]] = j;
        }
    }
}
85+
86+
87+
#[cfg(test)]
mod tests {
    use super::*;

    /// Reference construction used as an independent oracle: sorts the suffix
    /// start positions by comparing the suffixes directly.
    fn naive_suffix_array(s: &[u16]) -> Vec<usize> {
        let mut sa: Vec<usize> = (0..s.len()).collect();
        sa.sort_unstable_by(|&a, &b| s[a..].cmp(&s[b..]));
        sa
    }

    /// Checks that `sa` is the suffix array of `s`: it must be a permutation
    /// of `0..s.len()` whose suffixes appear in strictly increasing order.
    fn is_suffix_array_correct(s: &[u16], sa: &[usize]) -> bool {
        let n = s.len();
        if sa.len() != n {
            return false;
        }

        // Every position must appear exactly once.
        let mut seen = vec![false; n];
        for &pos in sa {
            if pos >= n || seen[pos] {
                return false;
            }
            seen[pos] = true;
        }

        // Distinct suffixes have distinct lengths and therefore never compare
        // equal, so the order has to be strictly increasing.
        sa.windows(2).all(|pair| s[pair[0]..] < s[pair[1]..])
    }

    #[test]
    fn test_empty_string() {
        let s: Vec<u16> = vec![];
        let sa = sa_is(&s);
        assert!(is_suffix_array_correct(&s, &sa));
    }

    #[test]
    fn test_single_character() {
        let s: Vec<u16> = vec![42];
        let sa = sa_is(&s);
        assert!(is_suffix_array_correct(&s, &sa));
    }

    #[test]
    fn test_two_characters() {
        let s: Vec<u16> = vec![2, 1];
        let sa = sa_is(&s);
        assert!(is_suffix_array_correct(&s, &sa));
    }

    #[test]
    fn test_repeated_characters() {
        let s: Vec<u16> = vec![1, 1, 1, 1];
        let sa = sa_is(&s);
        assert!(is_suffix_array_correct(&s, &sa));
    }

    #[test]
    fn test_ascending_sequence() {
        let s: Vec<u16> = vec![1, 2, 3, 4, 5];
        let sa = sa_is(&s);
        assert!(is_suffix_array_correct(&s, &sa));
    }

    #[test]
    fn test_descending_sequence() {
        let s: Vec<u16> = vec![5, 4, 3, 2, 1];
        let sa = sa_is(&s);
        assert!(is_suffix_array_correct(&s, &sa));
    }

    #[test]
    fn test_random_sequence() {
        let s: Vec<u16> = vec![10, 5, 8, 3, 1, 7, 2, 9, 4, 6];
        let sa = sa_is(&s);
        assert!(is_suffix_array_correct(&s, &sa));
    }

    #[test]
    fn test_large_values() {
        let s: Vec<u16> = vec![65535, 0, 32768, 1, 65534];
        let sa = sa_is(&s);
        assert!(is_suffix_array_correct(&s, &sa));
    }

    #[test]
    fn test_longer_sequence() {
        let s: Vec<u16> = (0..1000).map(|x| (x * 17 + 11) % 256).map(|x| x as u16).collect();
        let sa = sa_is(&s);
        assert!(is_suffix_array_correct(&s, &sa));
    }

    #[test]
    fn test_repeated_lms_substrings() {
        // "mississippi" contains the LMS substring "issi" twice, so the LMS
        // suffixes cannot be ordered by their substrings alone.
        let s: Vec<u16> = "mississippi".bytes().map(u16::from).collect();
        let sa = sa_is(&s);
        assert_eq!(sa, vec![10, 7, 4, 1, 0, 9, 8, 6, 3, 5, 2]);
        assert!(is_suffix_array_correct(&s, &sa));
    }

    #[test]
    fn test_matches_naive_on_all_short_strings() {
        // Every string of length 0..=7 over a three-symbol alphabet.
        for len in 0..=7u32 {
            for code in 0..3usize.pow(len) {
                let mut rest = code;
                let s: Vec<u16> = (0..len)
                    .map(|_| {
                        // The digit is always below 3, so the cast is lossless.
                        let digit = (rest % 3) as u16;
                        rest /= 3;
                        digit
                    })
                    .collect();
                assert_eq!(sa_is(&s), naive_suffix_array(&s), "input {:?}", s);
            }
        }
    }
}

0 commit comments

Comments
 (0)
Please sign in to comment.