Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: EleutherAI/tokengrams
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: master
Choose a base ref
...
head repository: EleutherAI/tokengrams
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: sais-dash
Choose a head ref
Can’t automatically merge. Don’t worry, you can still create the pull request.
  • 5 commits
  • 10 files changed
  • 2 contributors

Commits on Jun 22, 2024

  1. Claude SA-IS impl

    norabelrose committed Jun 22, 2024
    Copy the full SHA
    41ed334 View commit details
  2. passing tests

    norabelrose committed Jun 22, 2024
    Copy the full SHA
    aa48222 View commit details
  3. Get rid of verbose

    norabelrose committed Jun 22, 2024
    Copy the full SHA
    8a46937 View commit details
  4. sais python functions

    norabelrose committed Jun 22, 2024
    Copy the full SHA
    f900916 View commit details

Commits on Jun 23, 2024

  1. Ok add parallel sorting of suffixes

    Signed-off-by: Dashiell Stander <[email protected]>
    dashstander committed Jun 23, 2024
    Copy the full SHA
    4ddbc90 View commit details
Showing with 628 additions and 954 deletions.
  1. +31 −4 src/in_memory_index.rs
  2. +1 −1 src/lib.rs
  3. +3 −5 src/memmap_index.rs
  4. +0 −931 src/par_quicksort.rs
  5. +523 −0 src/sais.rs
  6. +19 −6 src/table.rs
  7. +29 −0 src/util.rs
  8. +15 −4 tests/tests.rs
  9. +1 −1 tokengrams/tests/test_gram_index.py
  10. +6 −2 tokengrams/tokengrams.pyi
35 changes: 31 additions & 4 deletions src/in_memory_index.rs
Original file line number Diff line number Diff line change
@@ -16,9 +16,9 @@ pub struct InMemoryIndex {
#[pymethods]
impl InMemoryIndex {
#[new]
pub fn new(_py: Python, tokens: Vec<u16>, verbose: bool) -> Self {
pub fn new(_py: Python, tokens: Vec<u16>) -> Self {
InMemoryIndex {
table: SuffixTable::new(tokens, verbose),
table: SuffixTable::new(tokens),
}
}

@@ -32,7 +32,6 @@ impl InMemoryIndex {
#[staticmethod]
pub fn from_token_file(
path: String,
verbose: bool,
token_limit: Option<usize>,
) -> PyResult<Self> {
let mut buffer = Vec::new();
@@ -47,10 +46,38 @@ impl InMemoryIndex {
};

Ok(InMemoryIndex {
table: SuffixTable::new(transmute_slice(buffer.as_slice()), verbose),
table: SuffixTable::new(transmute_slice(buffer.as_slice())),
})
}

/// Build an index from the token file at `path`, constructing the suffix table
/// with `SuffixTable::new_sais`.
///
/// The file's raw bytes are read into memory and reinterpreted via
/// `transmute_slice` before being handed to the table constructor.
///
/// When `token_limit` is `Some(n)`, at most `n * size_of::<u16>()` bytes are read
/// from the start of the file; otherwise the whole file is read.
///
/// # Errors
///
/// Propagates any I/O error raised while opening or reading the file.
#[staticmethod]
pub fn from_token_file_sais(
    path: String,
    token_limit: Option<usize>,
) -> PyResult<Self> {
    let mut buffer = Vec::new();
    let mut file = File::open(&path)?;

    match token_limit {
        Some(max_tokens) => {
            // Saturate rather than overflow: a huge `token_limit` should mean
            // "read everything", not panic (debug) or wrap to a tiny limit (release).
            let max_bytes = max_tokens.saturating_mul(std::mem::size_of::<u16>());
            file.take(max_bytes as u64).read_to_end(&mut buffer)?;
        }
        None => {
            file.read_to_end(&mut buffer)?;
        }
    }

    Ok(InMemoryIndex {
        table: SuffixTable::new_sais(transmute_slice(buffer.as_slice())),
    })
}

/// Build an index directly from an in-memory token vector, constructing the
/// suffix table with `SuffixTable::new_sais`.
#[staticmethod]
pub fn sais(tokens: Vec<u16>) -> Self {
    let table = SuffixTable::new_sais(tokens);
    InMemoryIndex { table }
}

/// Report whether the underlying suffix table contains `query`.
///
/// This is a thin delegation to the table's own `contains` method.
pub fn contains(&self, query: Vec<u16>) -> bool {
    let table = &self.table;
    table.contains(&query)
}
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
@@ -8,7 +8,7 @@ use pyo3::prelude::*;

mod in_memory_index;
mod memmap_index;
mod par_quicksort;
mod sais;
mod table;
mod util;

8 changes: 3 additions & 5 deletions src/memmap_index.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use rayon::prelude::*;
use std::fs::{File, OpenOptions};
use std::time::Instant;

use crate::mmap_slice::{MmapSlice, MmapSliceMut};
use crate::par_quicksort::par_sort_unstable_by_key;
use crate::table::SuffixTable;

/// A memmap index exposes suffix table functionality over text corpora too large to fit in memory.
@@ -29,7 +29,7 @@ impl MemmapIndex {
}

#[staticmethod]
pub fn build(text_path: String, table_path: String, verbose: bool) -> PyResult<Self> {
pub fn build(text_path: String, table_path: String) -> PyResult<Self> {
// Memory map the text as read-only
let text_mmap = MmapSlice::new(&File::open(&text_path)?)?;

@@ -73,10 +73,8 @@ impl MemmapIndex {
// The unstable algorithm is critical for avoiding out-of-memory errors, since it does
// not allocate any more memory than the input and output slices.
println!("Sorting indices...");
par_sort_unstable_by_key(
table_mmap.as_slice_mut(),
table_mmap.par_sort_unstable_by_key(
|&i| &text_mmap[i as usize..],
verbose,
);
});
println!("Time elapsed: {:?}", start.elapsed());
Loading