README, remove outdated license

norabelrose · norabelrose · commit 24798ce1622c · 2024-03-25T05:51:02.000Z
diff --git a/LICENSE.md b/LICENSE.md
@@ -18,8 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-**Third Party Licenses**
-
-This project also contains a modified version of the MIT licensed library `tokengrams-rs`, Copyright (c) 2021 Shunsuke Kanda.
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,38 @@
+# Tokengrams
+This library allows you to efficiently compute $n$-gram statistics for pre-tokenized text corpora used to train large language models. It does this not by explicitly pre-computing the $n$-gram counts for fixed $n$, but by creating a [suffix array](https://en.wikipedia.org/wiki/Suffix_array) index which allows you to efficiently compute the count of an $n$-gram on the fly for any $n$.
+
+Our code also allows you to turn your suffix array index into an efficient $n$-gram language model, which can be used to generate text or compute the perplexity of a given text.
+
+The backend is written in Rust, and the Python bindings are generated using [PyO3](https://github.com/PyO3/pyo3).
+
+# Installation
+Currently you need to build and install from source using `maturin`. We plan to release wheels on PyPI soon.
+
+```bash
+pip install maturin
+maturin develop
+```
+
+# Usage
+```python
+from tokengrams import MemmapIndex
+
+# Create a new index from an on-disk corpus called `document.bin` and save it to
+# `pile.idx`
+index = MemmapIndex.build(
+    "/mnt/ssd-1/pile_preshuffled/standard/document.bin",
+    "/mnt/ssd-1/nora/pile.idx",
+)
+
+# Get the count of "hello world" in the corpus
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m")
+print(index.count(tokenizer.encode("hello world")))
+
+# Later, you can load the saved index from disk by calling the constructor (`__init__`)
+index = MemmapIndex(
+    "/mnt/ssd-1/pile_preshuffled/standard/document.bin",
+    "/mnt/ssd-1/nora/pile.idx"
+)
+```
diff --git a/prototyping.ipynb b/prototyping.ipynb
diff --git a/src/table.rs b/src/table.rs
@@ -1,31 +1,10 @@
-/* This code is almost entirely based on suffix from BurntSushi. The original
-* program was licensed under the MIT license. We have modified it for
-* for two reasons:
-*
-* 1. The original implementation used u32 indices to point into the
-*    suffix array. This is smaller and fairly cache efficient, but here
-*    in the Real World we have to work with Big Data and our datasets
-*    are bigger than 2^32 bytes. So we have to work with u64 instead.
-*
-* 2. The original implementation had a utf8 interface. This is very
-*    convenient if you're working with strings, but we are working with
-*    byte arrays almost exclusively, and so just cut out the strings.
-*
-* When the comments below contradict these two statements, that's why.
-*/
 extern crate utf16_literal;
 
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use std::{fmt, ops::Deref, u64};
 
 /// A suffix table is a sequence of lexicographically sorted suffixes.
-///
-/// This is distinct from a suffix array in that it *only* contains
-/// suffix indices. It has no "enhanced" information like the inverse suffix
-/// table or least-common-prefix lengths (LCP array). This representation
-/// limits what you can do (and how fast), but it uses very little memory
-/// (4 bytes per character in the text).
 #[derive(Clone, Deserialize, Eq, PartialEq, Serialize)]
 pub struct SuffixTable<T = Box<[u16]>, U = Box<[u64]>> {
     text: T,