Major refactors
- serde for most neurons
- better I/O across boundary
- more consistent use of ScoreMatrix
- benchmarks and tests
clbarnes committed Jan 24, 2024
1 parent bae6b9f commit f68cbce
Showing 21 changed files with 782 additions and 172 deletions.
4 changes: 3 additions & 1 deletion nblast-py/Cargo.toml
@@ -9,8 +9,10 @@ edition = "2021"
[dependencies]
pyo3 = { version = "0.20", features = ["extension-module", "abi3-py39"] }
neurarbor = "0.2.0"
nblast = { path = "../nblast-rs", version = "^0.7.1", features = ["parallel", "kiddo"] }
nblast = { path = "../nblast-rs", version = "^0.7.1", features = ["parallel", "kiddo", "serde"] }
numpy = "0.20"
ciborium = "0.2.2"
serde_json = "1.0.111"

[lib]
name = "pynblast"
3 changes: 2 additions & 1 deletion nblast-py/pyproject.toml
@@ -19,7 +19,8 @@ classifiers = [
"Programming Language :: Python :: 3.12",
]
dependencies = [
"numpy >= 1.22.4"
"numpy >= 1.22.4",
"backports.strenum; python_version < '3.11'"
]

[project.urls]
11 changes: 8 additions & 3 deletions nblast-py/python/pynblast/__init__.py
@@ -1,18 +1,22 @@
# -*- coding: utf-8 -*-

"""Top-level package for nblast-rs."""
"""Top-level package for pynblast.
The main entry point for this package is the `NblastArena` class.
"""

__author__ = """Chris L. Barnes"""
__email__ = "[email protected]"

from .pynblast import get_version as _get_version, ResamplingArbor
from .pynblast import get_version as _get_version, ResamplingArbor, backend as _backend

__version__ = _get_version()
__version_info__ = tuple(int(n) for n in __version__.split("."))
__version_info__ = tuple(int(n) for n in __version__.split(".")[:3])

from .util import rectify_tangents, Idx, Symmetry
from .arena import NblastArena
from .score_matrix import ScoreMatrix
from .smat_builder import ScoreMatrixBuilder

__all__ = [
"NblastArena",
@@ -21,4 +25,5 @@
"rectify_tangents",
"Idx",
"ResamplingArbor",
"ScoreMatrixBuilder",
]
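A note on the `__version_info__` change above: slicing to the first three dot-separated components keeps the tuple of ints well-formed if the version string ever gains a suffix component. A minimal sketch, using a made-up version string rather than the package's real one:

version = "0.7.2.dev0"  # hypothetical four-component version
version_info = tuple(int(n) for n in version.split(".")[:3])
assert version_info == (0, 7, 2)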
85 changes: 62 additions & 23 deletions nblast-py/python/pynblast/arena.py
@@ -1,9 +1,13 @@
from typing import List, Tuple, Dict, Iterator, Optional
import warnings
from copy import copy

import numpy as np

from .score_matrix import ScoreMatrix
from .pynblast import ArenaWrapper
from .util import Idx, raise_if_none, rectify_tangents, Symmetry
import pandas as pd

DEFAULT_THREADS = 0
DEFAULT_K = 20
@@ -12,13 +16,21 @@
class NblastArena:
"""
Class for creating and keeping track of many neurons for comparison with NBLAST.
Create the arena with a score matrix.
Then use `arena.add_points()` to add point clouds; it returns the index used to refer to that neuron in queries
(if you have already calculated tangents and alphas *using the same neighborhood size `k`*,
use `arena.add_points_tangents_alphas()` instead).
Use `arena.query_target()`, `arena.queries_targets()`, and
`arena.all_v_all()` to perform queries.
You can retrieve the points, tangents, alpha values, or everything about a neuron
with `arena.points()`, `arena.tangents()`, `arena.alphas()`, and `arena.neuron_table()`.
"""

def __init__(
self,
dist_bins: List[float],
dot_bins: List[float],
score_mat: np.ndarray,
score_mat: ScoreMatrix,
use_alpha: bool = False,
threads: Optional[int] = DEFAULT_THREADS,
k=DEFAULT_K,
@@ -27,18 +39,8 @@ def __init__(
The required arguments describe a lookup table which is used to convert
``(distance, abs_dot_product)`` tuples into a score for a single
point match.
The ``*_bins`` arguments describe the bounds of the bins:
N bounds make for N-1 bins.
Queries are clamped to the domain of the lookup.
``score_mat`` is the table of values, in dist-major order.
For example, if the lookup table was stored as a pandas dataframe,
where the distance bins were in the left margin and the absolute dot
product bins in the top margin, the object would be instantiated by
>>> arena = NblastArena(df.index, df.columns, df.to_numpy())
See the ``ScoreMatrix`` namedtuple for convenience.
Queries are clamped to the domain of the lookup
(i.e. the outermost thresholds effectively behave as -inf and +inf respectively).
``k`` gives the number of points to use when calculating tangents.
@@ -51,11 +53,13 @@ def __init__(
self.threads = threads
self.k = k

if score_mat.shape != (len(dist_bins) - 1, len(dot_bins) - 1):
raise ValueError("Bin thresholds do not match score matrix")
score_vec = score_mat.flatten().tolist()
self._impl = ArenaWrapper(
dist_bins, dot_bins, score_vec, self.k, self.use_alpha, self.threads
score_mat.dist_thresholds.tolist(),
score_mat.dot_thresholds.tolist(),
score_mat._flat_values().tolist(),
self.k,
self.use_alpha,
self.threads,
)

def add_points(self, points: np.ndarray) -> Idx:
@@ -74,7 +78,7 @@ def add_points_tangents_alphas(
self,
points: np.ndarray,
tangents: np.ndarray,
alphas: Optional[np.ndarray],
alphas: np.ndarray,
) -> Idx:
"""Add an Nx3 point cloud representing a neuron, with pre-calculated tangents.
Tangents must be unit-length and in the same order as the points.
@@ -95,6 +99,10 @@ def add_points_tangents_alphas(
"Alpha values not given, but this NblastArena uses alpha weighting"
)
else:
warnings.warn(
"Alpha values should be given, even if they are not used",
PendingDeprecationWarning,
)
alphas = np.full(len(points), 1.0)
else:
alphas = np.asarray(alphas)
@@ -186,15 +194,15 @@ def points(self, idx) -> np.ndarray:
Order is arbitrary.
"""
return np.array(raise_if_none(self._impl.points(idx), idx))
return np.asarray(raise_if_none(self._impl.points(idx), idx))

def tangents(self, idx, rectify=False) -> np.ndarray:
"""Return a copy of the tangents associated with the indexed neuron.
Order is arbitrary, but consistent with the order returned by the
``.points`` method.
"""
out = np.array(raise_if_none(self._impl.tangents(idx), idx))
out = np.asarray(raise_if_none(self._impl.tangents(idx), idx))
return rectify_tangents(out, True) if rectify else out

def alphas(self, idx) -> np.ndarray:
@@ -203,4 +211,35 @@ def alphas(self, idx) -> np.ndarray:
Order is arbitrary, but consistent with the order returned by the
``.points`` method.
"""
return np.array(raise_if_none(self._impl.alphas(idx), idx))
return np.asarray(raise_if_none(self._impl.alphas(idx), idx))

def neuron_table(self, idx: Idx) -> pd.DataFrame:
"""Return a neuron's points, tangents, and alphas as a dataframe."""
arr = raise_if_none(self._impl.neuron_array(idx), idx)
return pd.DataFrame(
arr, columns=["x", "y", "z", "tangent_x", "tangent_y", "tangent_z", "alpha"]
)

# def serialize_neuron(
# self, idx: Idx, write_to: Optional[BytesIO], format: Format
# ) -> Optional[bytes]:
# b = raise_if_none(self._impl.serialize_neuron(idx, format), idx)
# if write_to is None:
# return b
# else:
# write_to.write(b)
# return None

# def add_serialized_neuron(self, read_from: BytesIO, format: Format) -> Idx:
# # todo: slower than adding p,t,a. Need better serialization?
# b = read_from.read()
# return self._impl.add_serialized_neuron(b, format)

def copy(self, deep=True):
out = copy(self)
if deep:
out._impl = out._impl.deepcopy()
return out

def __deepcopy__(self, memo=None):
return self.copy()
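Pulling the refactored arena API together, a minimal usage sketch based on the constructor and methods shown above (`ScoreMatrix`, `NblastArena`, `add_points`, `query_target`, `neuron_table`). The threshold values and point clouds are invented for illustration, and the positional `(query, target)` argument order for `query_target` is an assumption, not something confirmed by this diff:

import numpy as np
from pynblast import NblastArena, ScoreMatrix

# Hypothetical lookup table: 4 dist thresholds and 4 dot thresholds give a
# 3x3 grid of point-match scores (N thresholds describe N-1 bins).
smat = ScoreMatrix(
    dist_thresholds=[0.0, 1.0, 5.0, 25.0],
    dot_thresholds=[0.0, 0.33, 0.66, 1.0],
    values=np.array([
        [5.0, 6.0, 7.0],
        [1.0, 2.0, 3.0],
        [-2.0, -1.0, 0.0],
    ]),
)

arena = NblastArena(smat, k=20)

# Made-up Nx3 point clouds standing in for real neuron morphologies.
rng = np.random.default_rng(1991)
query_idx = arena.add_points(rng.random((100, 3)) * 10.0)
target_idx = arena.add_points(rng.random((120, 3)) * 10.0)

score = arena.query_target(query_idx, target_idx)  # assumed (query, target) order
table = arena.neuron_table(query_idx)  # points, tangents, and alphas as a DataFrame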
22 changes: 16 additions & 6 deletions nblast-py/python/pynblast/pynblast.pyi
@@ -2,6 +2,10 @@
from __future__ import annotations
from typing import List, Optional, Tuple
from .util import Idx
import numpy as np
from numpy.typing import NDArray

F = np.float64

class ResamplingArbor:
def __init__(self, table: List[Tuple[int, Optional[int], float, float, float]]): ...
@@ -27,9 +31,9 @@ class ArenaWrapper:
use_alpha: bool,
threads: Optional[int],
) -> None: ...
def add_points(self, points: list[list[float]]) -> Idx: ...
def add_points(self, points: NDArray[F]) -> Idx: ...
def add_points_tangents_alphas(
self, points: list[list[float]], tangents: list[float], alphas: list[float]
self, points: NDArray[F], tangents: NDArray[F], alphas: NDArray[F]
) -> Idx: ...
def query_target(
self,
@@ -55,12 +59,16 @@
def len(self) -> int: ...
def is_empty(self) -> bool: ...
def self_hit(self, idx: Idx) -> Optional[float]: ...
def points(self, idx: Idx) -> Optional[list[list[float]]]: ...
def tangents(self, idx: Idx) -> Optional[list[list[float]]]: ...
def alphas(self, idx: Idx) -> Optional[list[float]]: ...
def points(self, idx: Idx) -> Optional[NDArray[F]]: ...
def tangents(self, idx: Idx) -> Optional[NDArray[F]]: ...
def alphas(self, idx: Idx) -> Optional[NDArray[F]]: ...
def neuron_array(self, idx: Idx) -> Optional[NDArray[F]]: ...
def serialize_neuron(self, idx: Idx, format: str) -> Optional[bytes]: ...
def add_serialized_neuron(self, b: bytes, format: str) -> Idx: ...
def deepcopy(self): ...

def build_score_matrix(
points: List[List[List[float]]],
points: List[NDArray[np.float64]],
k: int,
seed: int,
use_alpha: bool,
@@ -74,3 +82,5 @@
max_nonmatching_pairs: Optional[int],
threads: Optional[int],
) -> Tuple[List[float], List[float], List[float]]: ...
def get_version() -> str: ...
def backend() -> str: ...
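The stub also declares two module-level helpers, `get_version()` and `backend()`. A tiny sketch of calling them directly; exactly what the `backend()` string reports (presumably which compiled-in spatial index the wheel uses) is an assumption:

from pynblast.pynblast import backend, get_version

print(get_version())  # version of the compiled extension module
print(backend())      # name of the Rust backend this build uses (assumed meaning)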
34 changes: 30 additions & 4 deletions nblast-py/python/pynblast/score_matrix.py
@@ -2,6 +2,7 @@
import csv

import numpy as np
from numpy.typing import ArrayLike


def parse_interval(s) -> tuple[float, float]:
@@ -22,10 +23,35 @@ def intervals_to_bins(intervals: list[tuple[float, float]]):
return out


class ScoreMatrix(NamedTuple):
dist_thresholds: List[float]
dot_thresholds: List[float]
values: np.ndarray
class ScoreMatrix:
"""Representation of a lookup table for point match scores.
N thresholds represent N-1 bins.
The values are in dist-major order
(i.e. values in the same dist bin are next to each other).
"""

def __init__(
self,
dist_thresholds: ArrayLike,
dot_thresholds: ArrayLike,
values: ArrayLike,
) -> None:
self.dist_thresholds = np.asarray(dist_thresholds, np.float64).flatten()
self.dot_thresholds = np.asarray(dot_thresholds, np.float64).flatten()
self.values = np.asarray(values, np.float64)

exp_shape = (len(self.dist_thresholds) - 1, len(self.dot_thresholds) - 1)
if self.values.shape != exp_shape:
raise ValueError(
"For N dist_thresholds and M dot_thresholds, values must be (N-1)x(M-1); "
f"got shape {self.values.shape}, expected {exp_shape}"
)

def _flat_values(self):
# Flatten in dist-major (C) order regardless of the array's memory layout;
# ``flatten`` always returns this logical order, copying if necessary.
return self.values.flatten()

def to_df(self):
import pandas as pd
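A quick illustration of the constructor contract above, with made-up numbers: five dist thresholds and three dot thresholds describe a 4x2 grid of values, and a mismatched table is rejected at construction time.

import numpy as np
from pynblast import ScoreMatrix

smat = ScoreMatrix(
    dist_thresholds=[0.0, 0.75, 1.5, 3.0, 6.0],  # 5 thresholds -> 4 dist bins
    dot_thresholds=[0.0, 0.5, 1.0],              # 3 thresholds -> 2 dot bins
    values=np.arange(8, dtype=float).reshape(4, 2),
)

# Wrong shape: 2 thresholds each way imply a 1x1 table, not 2x2.
try:
    ScoreMatrix([0.0, 1.0], [0.0, 1.0], np.zeros((2, 2)))
except ValueError as err:
    print(err)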
33 changes: 27 additions & 6 deletions nblast-py/python/pynblast/smat_builder.py
@@ -1,18 +1,34 @@
from typing import List, Optional, Set, Union

import numpy as np
from numpy.typing import NDArray

from pynblast.arena import DEFAULT_K, DEFAULT_THREADS
from .arena import DEFAULT_K, DEFAULT_THREADS
from .score_matrix import ScoreMatrix
from .pynblast import build_score_matrix


class ScoreMatrixBuilder:
"""Class for training your own score matrix from data.
1. Create the builder
2. Add some point clouds to it with `.add_points()`, which returns an index for each
3. Use those indices to designate groups of neurons which should match each other with `.add_matching_set()`
4. Optionally designate groups which should not match each other with `.add_nonmatching_set()`.
5. Set the dist and dot bins (`.set_{dist,dot}_bins()`).
Each can be a list of N-1 inner boundaries for N bins,
or just the integer N,
in which case the boundaries are determined from the data.
6. Optionally, set the maximum number of matching and/or nonmatching pairs with `.set_max_pairs()`.
7. `.build()`
"""

def __init__(self, seed: int, k: int = DEFAULT_K, use_alpha: bool = False):
self.seed = seed
self.k = k
self.use_alpha = use_alpha

self.neurons: List[List[List[float]]] = []
self.neurons: List[NDArray[np.float64]] = []

self.matching_sets: List[List[int]] = []
self.nonmatching_sets: Optional[List[List[int]]] = None
@@ -25,23 +41,25 @@ def __init__(self, seed: int, k: int = DEFAULT_K, use_alpha: bool = False):
self.max_matching_pairs: Optional[int] = None
self.max_nonmatching_pairs: Optional[int] = None

def add_neuron(self, points: np.ndarray) -> int:
def add_points(self, points: np.ndarray) -> int:
points = np.asarray(points)
if len(points) < self.k:
raise ValueError(f"Neuron does not have enough points (needs {self.k})")
if points.ndim != 2 or points.shape[1] != 3:
raise ValueError("Not an Nx3 array")
idx = len(self.neurons)
self.neurons.append(points.tolist())
self.neurons.append(points)
return idx

def add_matching_set(self, ids: Set[int]):
self.matching_sets.append(sorted(ids))
return self

def add_nonmatching_set(self, ids: Set[int]):
if self.nonmatching_sets is None:
self.nonmatching_sets = []
self.nonmatching_sets.append(sorted(ids))
return self

def set_dist_bins(self, bins: Union[int, List[float]]):
"""Number of bins, or list of inner boundaries of bins"""
@@ -51,6 +69,7 @@ def set_dist_bins(self, bins: Union[int, List[float]]):
else:
self.dist_inner_bounds = sorted(bins)
self.dist_n_bins = None
return self

def set_dot_bins(self, bins: Union[int, List[float]]):
"""Number of bins, or list of inner boundaries of bins"""
@@ -60,10 +79,12 @@ def set_dot_bins(self, bins: Union[int, List[float]]):
else:
self.dot_inner_bounds = sorted(bins)
self.dot_n_bins = None
return self

def set_max_pairs(self, matching: Optional[int], nonmatching: Optional[int]):
self.max_matching_pairs = matching
self.max_nonmatching_pairs = nonmatching
return self

def build(self, threads: Optional[int] = DEFAULT_THREADS):
dist_bins, dot_bins, cells = build_score_matrix(
@@ -81,5 +102,5 @@ def build(self, threads: Optional[int] = DEFAULT_THREADS):
self.max_nonmatching_pairs,
threads,
)
values = np.array(cells).reshape((len(dist_bins), len(dot_bins)))
return dist_bins, dot_bins, values
values = np.array(cells).reshape((len(dist_bins) - 1, len(dot_bins) - 1))
return ScoreMatrix(dist_bins, dot_bins, values)
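A usage sketch following the seven steps in the `ScoreMatrixBuilder` docstring, using only the methods shown in this diff. The point clouds, group memberships, bin choices, and pair limits are illustrative assumptions; real training data would use neurons with known (non-)matching relationships:

import numpy as np
from pynblast import ScoreMatrixBuilder

rng = np.random.default_rng(42)
builder = ScoreMatrixBuilder(seed=42, k=20)

# Steps 1-2: add made-up point clouds; each call returns an integer index.
ids = [builder.add_points(rng.random((200, 3)) * 100.0) for _ in range(6)]

# Steps 3-4: designate matching and (optionally) non-matching groups by index.
builder.add_matching_set({ids[0], ids[1], ids[2]})
builder.add_matching_set({ids[3], ids[4]})
builder.add_nonmatching_set({ids[0], ids[5]})

# Step 5: an integer lets the boundaries be determined from the data;
# a list gives the inner boundaries explicitly.
builder.set_dist_bins(10)
builder.set_dot_bins([0.25, 0.5, 0.75])  # 3 inner boundaries -> 4 dot bins

# Steps 6-7: optionally cap the sampled pairs, then build a ScoreMatrix.
builder.set_max_pairs(100_000, 100_000)
smat = builder.build()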