Major refactors
- serde for most neurons
- better I/O across boundary
- more consistent use of ScoreMatrix
- benchmarks and tests
clbarnes committed Jan 24, 2024
1 parent bae6b9f commit f68cbce
Showing 21 changed files with 782 additions and 172 deletions.
4 changes: 3 additions & 1 deletion nblast-py/Cargo.toml
@@ -9,8 +9,10 @@ edition = "2021"
[dependencies]
pyo3 = { version = "0.20", features = ["extension-module", "abi3-py39"] }
neurarbor = "0.2.0"
nblast = { path = "../nblast-rs", version = "^0.7.1", features = ["parallel", "kiddo"] }
nblast = { path = "../nblast-rs", version = "^0.7.1", features = ["parallel", "kiddo", "serde"] }
numpy = "0.20"
ciborium = "0.2.2"
serde_json = "1.0.111"

[lib]
name = "pynblast"
3 changes: 2 additions & 1 deletion nblast-py/pyproject.toml
@@ -19,7 +19,8 @@ classifiers = [
"Programming Language :: Python :: 3.12",
]
dependencies = [
"numpy >= 1.22.4"
"numpy >= 1.22.4",
"backports.strenum; python_version < '3.11'"
]

[project.urls]
11 changes: 8 additions & 3 deletions nblast-py/python/pynblast/__init__.py
@@ -1,18 +1,22 @@
# -*- coding: utf-8 -*-

"""Top-level package for nblast-rs."""
"""Top-level package for pynblast.
The main entry point for this package is the `NblastArena` class.
"""

__author__ = """Chris L. Barnes"""
__email__ = "[email protected]"

from .pynblast import get_version as _get_version, ResamplingArbor
from .pynblast import get_version as _get_version, ResamplingArbor, backend as _backend

__version__ = _get_version()
__version_info__ = tuple(int(n) for n in __version__.split("."))
__version_info__ = tuple(int(n) for n in __version__.split(".")[:3])

from .util import rectify_tangents, Idx, Symmetry
from .arena import NblastArena
from .score_matrix import ScoreMatrix
from .smat_builder import ScoreMatrixBuilder

__all__ = [
"NblastArena",
@@ -21,4 +25,5 @@
"rectify_tangents",
"Idx",
"ResamplingArbor",
"ScoreMatrixBuilder",
]
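A note on the `__version_info__` change above: slicing to the first three dot-separated components keeps the tuple of ints well-formed if the version string ever gains a suffix component. A minimal sketch, using a made-up version string rather than the package's real one:

version = "0.7.2.dev0"  # hypothetical four-component version
version_info = tuple(int(n) for n in version.split(".")[:3])
assert version_info == (0, 7, 2)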
85 changes: 62 additions & 23 deletions nblast-py/python/pynblast/arena.py
@@ -1,9 +1,13 @@
from typing import List, Tuple, Dict, Iterator, Optional
import warnings
from copy import copy

import numpy as np

from .score_matrix import ScoreMatrix
from .pynblast import ArenaWrapper
from .util import Idx, raise_if_none, rectify_tangents, Symmetry
import pandas as pd

DEFAULT_THREADS = 0
DEFAULT_K = 20
@@ -12,13 +16,21 @@
class NblastArena:
"""
Class for creating and keeping track of many neurons for comparison with NBLAST.
Create the arena with a score matrix.
Then use `arena.add_points()` to add point clouds; it returns the index used to refer to that neuron in queries
(if you have already calculated tangents and alphas *using the same neighborhood size `k`*,
use `arena.add_points_tangents_alphas()` instead).
Use `arena.query_target()`, `arena.queries_targets()`, and
`arena.all_v_all()` to perform queries.
You can retrieve the points, tangents, alpha values, or everything about a neuron
with `arena.points()`, `arena.tangents()`, `arena.alphas()`, and `arena.neuron_table()`.
"""

def __init__(
self,
dist_bins: List[float],
dot_bins: List[float],
score_mat: np.ndarray,
score_mat: ScoreMatrix,
use_alpha: bool = False,
threads: Optional[int] = DEFAULT_THREADS,
k=DEFAULT_K,
@@ -27,18 +39,8 @@ def __init__(
The required arguments describe a lookup table which is used to convert
``(distance, abs_dot_product)`` tuples into a score for a single
point match.
The ``*_bins`` arguments describe the bounds of the bins:
N bounds make for N-1 bins.
Queries are clamped to the domain of the lookup.
``score_mat`` is the table of values, in dist-major order.
For example, if the lookup table was stored as a pandas dataframe,
where the distance bins were in the left margin and the absolute dot
product bins in the top margin, the object would be instantiated by
>>> arena = NblastArena(df.index, df.columns, df.to_numpy())
See the ``ScoreMatrix`` namedtuple for convenience.
Queries are clamped to the domain of the lookup
(i.e. the outermost thresholds effectively behave as -inf and +inf respectively).
``k`` gives the number of points to use when calculating tangents.
@@ -51,11 +53,13 @@ def __init__(
self.threads = threads
self.k = k

if score_mat.shape != (len(dist_bins) - 1, len(dot_bins) - 1):
raise ValueError("Bin thresholds do not match score matrix")
score_vec = score_mat.flatten().tolist()
self._impl = ArenaWrapper(
dist_bins, dot_bins, score_vec, self.k, self.use_alpha, self.threads
score_mat.dist_thresholds.tolist(),
score_mat.dot_thresholds.tolist(),
score_mat._flat_values().tolist(),
self.k,
self.use_alpha,
self.threads,
)

def add_points(self, points: np.ndarray) -> Idx:
@@ -74,7 +78,7 @@ def add_points_tangents_alphas(
self,
points: np.ndarray,
tangents: np.ndarray,
alphas: Optional[np.ndarray],
alphas: np.ndarray,
) -> Idx:
"""Add an Nx3 point cloud representing a neuron, with pre-calculated tangents.
Tangents must be unit-length and in the same order as the points.
@@ -95,6 +99,10 @@ def add_points_tangents_alphas(
"Alpha values not given, but this NblastArena uses alpha weighting"
)
else:
warnings.warn(
"Alpha values should be given, even if they are not used",
PendingDeprecationWarning,
)
alphas = np.full(len(points), 1.0)
else:
alphas = np.asarray(alphas)
@@ -186,15 +194,15 @@ def points(self, idx) -> np.ndarray:
Order is arbitrary.
"""
return np.array(raise_if_none(self._impl.points(idx), idx))
return np.asarray(raise_if_none(self._impl.points(idx), idx))

def tangents(self, idx, rectify=False) -> np.ndarray:
"""Return a copy of the tangents associated with the indexed neuron.
Order is arbitrary, but consistent with the order returned by the
``.points`` method.
"""
out = np.array(raise_if_none(self._impl.tangents(idx), idx))
out = np.asarray(raise_if_none(self._impl.tangents(idx), idx))
return rectify_tangents(out, True) if rectify else out

def alphas(self, idx) -> np.ndarray:
@@ -203,4 +211,35 @@ def alphas(self, idx) -> np.ndarray:
Order is arbitrary, but consistent with the order returned by the
``.points`` method.
"""
return np.array(raise_if_none(self._impl.alphas(idx), idx))
return np.asarray(raise_if_none(self._impl.alphas(idx), idx))

def neuron_table(self, idx: Idx) -> pd.DataFrame:
"""Return a neuron's points, tangents, and alphas as a dataframe."""
arr = raise_if_none(self._impl.neuron_array(idx), idx)
return pd.DataFrame(
arr, columns=["x", "y", "z", "tangent_x", "tangent_y", "tangent_z", "alpha"]
)

# def serialize_neuron(
# self, idx: Idx, write_to: Optional[BytesIO], format: Format
# ) -> Optional[bytes]:
# b = raise_if_none(self._impl.serialize_neuron(idx, format), idx)
# if write_to is None:
# return b
# else:
# write_to.write(b)
# return None

# def add_serialized_neuron(self, read_from: BytesIO, format: Format) -> Idx:
# # todo: slower than adding p,t,a. Need better serialization?
# b = read_from.read()
# return self._impl.add_serialized_neuron(b, format)

def copy(self, deep=True):
out = copy(self)
if deep:
out._impl = out._impl.deepcopy()
return out

def __deepcopy__(self, memo=None):
return self.copy()
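Pulling the refactored arena API together, a minimal usage sketch based on the constructor and methods shown above (`ScoreMatrix`, `NblastArena`, `add_points`, `query_target`, `neuron_table`). The threshold values and point clouds are invented for illustration, and the positional `(query, target)` argument order for `query_target` is an assumption, not something confirmed by this diff:

import numpy as np
from pynblast import NblastArena, ScoreMatrix

# Hypothetical lookup table: 4 dist thresholds and 4 dot thresholds give a
# 3x3 grid of point-match scores (N thresholds describe N-1 bins).
smat = ScoreMatrix(
    dist_thresholds=[0.0, 1.0, 5.0, 25.0],
    dot_thresholds=[0.0, 0.33, 0.66, 1.0],
    values=np.array([
        [5.0, 6.0, 7.0],
        [1.0, 2.0, 3.0],
        [-2.0, -1.0, 0.0],
    ]),
)

arena = NblastArena(smat, k=20)

# Made-up Nx3 point clouds standing in for real neuron morphologies.
rng = np.random.default_rng(1991)
query_idx = arena.add_points(rng.random((100, 3)) * 10.0)
target_idx = arena.add_points(rng.random((120, 3)) * 10.0)

score = arena.query_target(query_idx, target_idx)  # assumed (query, target) order
table = arena.neuron_table(query_idx)  # points, tangents, and alphas as a DataFrame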
22 changes: 16 additions & 6 deletions nblast-py/python/pynblast/pynblast.pyi
@@ -2,6 +2,10 @@
from __future__ import annotations
from typing import List, Optional, Tuple
from .util import Idx
import numpy as np
from numpy.typing import NDArray

F = np.float64

class ResamplingArbor:
def __init__(self, table: List[Tuple[int, Optional[int], float, float, float]]): ...
@@ -27,9 +31,9 @@ class ArenaWrapper:
use_alpha: bool,
threads: Optional[int],
) -> None: ...
def add_points(self, points: list[list[float]]) -> Idx: ...
def add_points(self, points: NDArray[F]) -> Idx: ...
def add_points_tangents_alphas(
self, points: list[list[float]], tangents: list[float], alphas: list[float]
self, points: NDArray[F], tangents: NDArray[F], alphas: NDArray[F]
) -> Idx: ...
def query_target(
self,
@@ -55,12 +59,16 @@
def len(self) -> int: ...
def is_empty(self) -> bool: ...
def self_hit(self, idx: Idx) -> Optional[float]: ...
def points(self, idx: Idx) -> Optional[list[list[float]]]: ...
def tangents(self, idx: Idx) -> Optional[list[list[float]]]: ...
def alphas(self, idx: Idx) -> Optional[list[float]]: ...
def points(self, idx: Idx) -> Optional[NDArray[F]]: ...
def tangents(self, idx: Idx) -> Optional[NDArray[F]]: ...
def alphas(self, idx: Idx) -> Optional[NDArray[F]]: ...
def neuron_array(self, idx: Idx) -> Optional[NDArray[F]]: ...
def serialize_neuron(self, idx: Idx, format: str) -> Optional[bytes]: ...
def add_serialized_neuron(self, b: bytes, format: str) -> Idx: ...
def deepcopy(self): ...

def build_score_matrix(
points: List[List[List[float]]],
points: List[NDArray[np.float64]],
k: int,
seed: int,
use_alpha: bool,
@@ -74,3 +82,5 @@
max_nonmatching_pairs: Optional[int],
threads: Optional[int],
) -> Tuple[List[float], List[float], List[float]]: ...
def get_version() -> str: ...
def backend() -> str: ...
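The stub also declares two module-level helpers, `get_version()` and `backend()`. A tiny sketch of calling them directly; exactly what the `backend()` string reports (presumably which compiled-in spatial index the wheel uses) is an assumption:

from pynblast.pynblast import backend, get_version

print(get_version())  # version of the compiled extension module
print(backend())      # name of the Rust backend this build uses (assumed meaning)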
34 changes: 30 additions & 4 deletions nblast-py/python/pynblast/score_matrix.py
@@ -2,6 +2,7 @@
import csv

import numpy as np
from numpy.typing import ArrayLike


def parse_interval(s) -> tuple[float, float]:
@@ -22,10 +23,35 @@ def intervals_to_bins(intervals: list[tuple[float, float]]):
return out


class ScoreMatrix(NamedTuple):
dist_thresholds: List[float]
dot_thresholds: List[float]
values: np.ndarray
class ScoreMatrix:
"""Representation of a lookup table for point match scores.
N thresholds represent N-1 bins.
The values are in dist-major order
(i.e. values in the same dist bin are next to each other).
"""

def __init__(
self,
dist_thresholds: ArrayLike,
dot_thresholds: ArrayLike,
values: ArrayLike,
) -> None:
self.dist_thresholds = np.asarray(dist_thresholds, np.float64).flatten()
self.dot_thresholds = np.asarray(dot_thresholds, np.float64).flatten()
self.values = np.asarray(values, np.float64)

exp_shape = (len(self.dist_thresholds) - 1, len(self.dot_thresholds) - 1)
if self.values.shape != exp_shape:
raise ValueError(
"For N dist_thresholds and M dot_thresholds, values must be (N-1)x(M-1); "
f"got shape {self.values.shape}, expected {exp_shape}"
)

def _flat_values(self):
# Flatten in dist-major (C) order regardless of the array's memory layout;
# ``flatten`` always returns this logical order, copying if necessary.
return self.values.flatten()

def to_df(self):
import pandas as pd
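A quick illustration of the constructor contract above, with made-up numbers: five dist thresholds and three dot thresholds describe a 4x2 grid of values, and a mismatched table is rejected at construction time.

import numpy as np
from pynblast import ScoreMatrix

smat = ScoreMatrix(
    dist_thresholds=[0.0, 0.75, 1.5, 3.0, 6.0],  # 5 thresholds -> 4 dist bins
    dot_thresholds=[0.0, 0.5, 1.0],              # 3 thresholds -> 2 dot bins
    values=np.arange(8, dtype=float).reshape(4, 2),
)

# Wrong shape: 2 thresholds each way imply a 1x1 table, not 2x2.
try:
    ScoreMatrix([0.0, 1.0], [0.0, 1.0], np.zeros((2, 2)))
except ValueError as err:
    print(err)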
33 changes: 27 additions & 6 deletions nblast-py/python/pynblast/smat_builder.py
@@ -1,18 +1,34 @@
from typing import List, Optional, Set, Union

import numpy as np
from numpy.typing import NDArray

from pynblast.arena import DEFAULT_K, DEFAULT_THREADS
from .arena import DEFAULT_K, DEFAULT_THREADS
from .score_matrix import ScoreMatrix
from .pynblast import build_score_matrix


class ScoreMatrixBuilder:
"""Class for training your own score matrix from data.
1. Create the builder
2. Add some point clouds to it with `.add_points()`, which returns an index for each
3. Use those indices to designate groups of neurons which should match each other with `.add_matching_set()`
4. Optionally designate groups which should not match each other with `.add_nonmatching_set()`.
5. Set the dist and dot bins (`.set_{dist,dot}_bins()`).
Each can be a list of N-1 inner boundaries for N bins,
or just the integer N,
in which case the boundaries are determined from the data.
6. Optionally, set the maximum number of matching and/or nonmatching pairs with `.set_max_pairs()`.
7. `.build()`
"""

def __init__(self, seed: int, k: int = DEFAULT_K, use_alpha: bool = False):
self.seed = seed
self.k = k
self.use_alpha = use_alpha

self.neurons: List[List[List[float]]] = []
self.neurons: List[NDArray[np.float64]] = []

self.matching_sets: List[List[int]] = []
self.nonmatching_sets: Optional[List[List[int]]] = None
@@ -25,23 +41,25 @@ def __init__(self, seed: int, k: int = DEFAULT_K, use_alpha: bool = False):
self.max_matching_pairs: Optional[int] = None
self.max_nonmatching_pairs: Optional[int] = None

def add_neuron(self, points: np.ndarray) -> int:
def add_points(self, points: np.ndarray) -> int:
points = np.asarray(points)
if len(points) < self.k:
raise ValueError(f"Neuron does not have enough points (needs {self.k})")
if points.ndim != 2 or points.shape[1] != 3:
raise ValueError("Not an Nx3 array")
idx = len(self.neurons)
self.neurons.append(points.tolist())
self.neurons.append(points)
return idx

def add_matching_set(self, ids: Set[int]):
self.matching_sets.append(sorted(ids))
return self

def add_nonmatching_set(self, ids: Set[int]):
if self.nonmatching_sets is None:
self.nonmatching_sets = []
self.nonmatching_sets.append(sorted(ids))
return self

def set_dist_bins(self, bins: Union[int, List[float]]):
"""Number of bins, or list of inner boundaries of bins"""
@@ -51,6 +69,7 @@ def set_dist_bins(self, bins: Union[int, List[float]]):
else:
self.dist_inner_bounds = sorted(bins)
self.dist_n_bins = None
return self

def set_dot_bins(self, bins: Union[int, List[float]]):
"""Number of bins, or list of inner boundaries of bins"""
@@ -60,10 +79,12 @@ def set_dot_bins(self, bins: Union[int, List[float]]):
else:
self.dot_inner_bounds = sorted(bins)
self.dot_n_bins = None
return self

def set_max_pairs(self, matching: Optional[int], nonmatching: Optional[int]):
self.max_matching_pairs = matching
self.max_nonmatching_pairs = nonmatching
return self

def build(self, threads: Optional[int] = DEFAULT_THREADS):
dist_bins, dot_bins, cells = build_score_matrix(
@@ -81,5 +102,5 @@ def build(self, threads: Optional[int] = DEFAULT_THREADS):
self.max_nonmatching_pairs,
threads,
)
values = np.array(cells).reshape((len(dist_bins), len(dot_bins)))
return dist_bins, dot_bins, values
values = np.array(cells).reshape((len(dist_bins) - 1, len(dot_bins) - 1))
return ScoreMatrix(dist_bins, dot_bins, values)
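A usage sketch following the seven steps in the `ScoreMatrixBuilder` docstring, using only the methods shown in this diff. The point clouds, group memberships, bin choices, and pair limits are illustrative assumptions; real training data would use neurons with known (non-)matching relationships:

import numpy as np
from pynblast import ScoreMatrixBuilder

rng = np.random.default_rng(42)
builder = ScoreMatrixBuilder(seed=42, k=20)

# Steps 1-2: add made-up point clouds; each call returns an integer index.
ids = [builder.add_points(rng.random((200, 3)) * 100.0) for _ in range(6)]

# Steps 3-4: designate matching and (optionally) non-matching groups by index.
builder.add_matching_set({ids[0], ids[1], ids[2]})
builder.add_matching_set({ids[3], ids[4]})
builder.add_nonmatching_set({ids[0], ids[5]})

# Step 5: an integer lets the boundaries be determined from the data;
# a list gives the inner boundaries explicitly.
builder.set_dist_bins(10)
builder.set_dot_bins([0.25, 0.5, 0.75])  # 3 inner boundaries -> 4 dot bins

# Steps 6-7: optionally cap the sampled pairs, then build a ScoreMatrix.
builder.set_max_pairs(100_000, 100_000)
smat = builder.build()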