🪞🧬 Text representation for biomedical entities via PyOBO (pykeen#1055)

cthoyt · mberr · web-flow · commit f7e67143f56d · 2023-01-30T01:19:46.000Z
Similarly to the `WikidataTextRepresentation`, this uses `PyOBO` as a
service for looking up labels for entities encoded with `CURIE`s
appearing in biomedical knowledge graphs. Unfortunately, the semantics
of all of the existing biomedical knowledge graphs are garbage, and
don't use standardized identifiers, so this isn't applicable for
anything built-in at the moment. An example for generating a graph where
this works is given.

Requirements:

```shell
python -m pip install pyobo bioontologies
```


Example with a very tiny dataset:

```python
import numpy as np
from pykeen.datasets import EagerDataset
from pykeen.nn import BiomedicalCURIERepresentation
from pykeen.triples import TriplesFactory

triples = [
    ('uberon:0000004', 'ro:0002216', 'go:0007608'),
]
triples = TriplesFactory.from_labeled_triples(np.array(triples))
dataset = EagerDataset(triples, triples, triples)
dataset.summarize()

entity_representations = BiomedicalCURIERepresentation.from_dataset(dataset=dataset, encoder="transformer")
print(entity_representations)
```

Example with full training:

```python
from pykeen.datasets import get_dataset
from pykeen.models import ERModel
from pykeen.nn import BiomedicalCURIERepresentation
from pykeen.pipeline import pipeline
import bioontologies

# Generate graph dataset from the Monarch Disease Ontology (MONDO)
graph = bioontologies.get_obograph_by_prefix("mondo").squeeze(standardize=True)
triples = (edge.as_tuple() for edge in graph.edges)
triples = [t for t in triples if all(t)]
triples_factory = TriplesFactory.from_labeled_triples(np.array(triples))
dataset = Dataset.from_tf(triples_factory)

entity_representations = BiomedicalCURIERepresentation.from_dataset(dataset=dataset, encoder="transformer")
result = pipeline(
    dataset=dataset,
    model=ERModel,
    model_kwargs=dict(
        interaction="distmult",
        entity_representations=entity_representations,
        relation_representation_kwargs=dict(
            shape=entity_representations.shape,
        ),
    ),
)
```

---------

Co-authored-by: Max Berrendorf <berrendorf@dbs.ifi.lmu.de>
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -259,6 +259,7 @@
     "numpy": ("https://numpy.org/doc/stable", None),
     "optuna": ("https://optuna.readthedocs.io/en/latest", None),
     "pybel": ("https://pybel.readthedocs.io/en/latest/", None),
+    "pyobo": ("https://pyobo.readthedocs.io/en/stable/", None),
     "class_resolver": ("https://class-resolver.readthedocs.io/en/latest/", None),
     "rexmex": ("https://rexmex.readthedocs.io/en/latest/", None),
     "bio2bel": ("https://bio2bel.readthedocs.io/en/latest/", None),
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
@@ -109,4 +109,5 @@ Name              Description
 ``tests``         Code needed to run tests. Typically handled with ``tox -e py``
 ``docs``          Building of the documentation
 ``opt_einsum``    Improve performance of :func:`torch.einsum` by replacing with :func:`opt_einsum.contract`
+``biomedicine``   Use of :mod:`pyobo` for lookup of biomedical entity labels
 ================  =========================================================================================
diff --git a/docs/source/tutorial/representations.rst b/docs/source/tutorial/representations.rst
@@ -51,23 +51,18 @@ relations (including inverse relations) as tokens.
 
 Text-based
 ----------
-Text-based representations use, e.g., the entities' (or relations') labels to
-derive representations. To this end, PyKEEN provides a base class
-:class:`pykeen.nn.representation.TextRepresentation` with a configurable
-:class:`pykeen.nn.text.TextEncoder`. As a baseline without external dependencies,
-:class:`pykeen.nn.text.CharacterEmbeddingTextEncoder` encodes the label character-wise,
-with trainable representations for individual characters.
-A more advanced text encoder is given by
-:class:`pykeen.nn.text.TransformerTextEncoder`, which utilizes a
+Text-based representations use the entities' (or relations') labels to
+derive representations. To this end,
+:class:`pykeen.nn.representation.TextRepresentation` uses a
 (pre-trained) transformer model from the :mod:`transformers` library to encode
 the labels. Since the transformer models have been trained on huge corpora
 of text, their text encodings often contain semantic information, i.e.,
 labels with similar semantic meaning get similar representations. While we
 can also benefit from these strong features by just initializing an
 :class:`pykeen.nn.representation.Embedding` with the vectors, e.g., using
 :class:`pykeen.nn.init.LabelBasedInitializer`, the
-:class:`pykeen.nn.representation.TextEncoder` include the
-text encoder model as part of the KGE model, and thus allow fine-tuning
+:class:`pykeen.nn.representation.TextRepresentation` includes the
+transformer model as part of the KGE model, and thus allows fine-tuning
 the language model for the KGE task. This is beneficial, e.g., since it
 allows a simple form of obtaining an inductive model, which can make
 predictions for entities not seen during training.
@@ -138,3 +133,18 @@ function, we would get similar scores
 
 As a downside, this will usually substantially increase the
 computational cost of computing triple scores.
+
+Biomedical Entities
+~~~~~~~~~~~~~~~~~~~
+If your dataset is labeled with compact uniform resource identifiers (i.e., CURIEs)
+for biomedical entities like chemicals, proteins, diseases, and pathways, then
+the :class:`pykeen.nn.representation.BiomedicalCURIERepresentation`
+representation can make use of :mod:`pyobo` to look up their names via the
+:func:`pyobo.get_name` function, then encode them using the text encoder.
+
+Unfortunately, none of the biomedical knowledge graphs in PyKEEN (at the time of adding
+this representation) use CURIEs for referencing biomedical entities. In the future, we hope
+this will change.
+
+To learn more about CURIEs, please take a look at the `Bioregistry <https://bioregistry.io>`_
+and `this blog post on CURIEs <https://cthoyt.com/2021/09/14/curies.html>`_.
diff --git a/setup.cfg b/setup.cfg
@@ -113,6 +113,9 @@ transformers =
 lightning =
     # cf. https://github.com/Lightning-AI/lightning/pull/14117
     pytorch_lightning>=1.7.2
+biomedicine =
+    bioregistry
+    pyobo
 tests =
     unittest-templates>=0.0.5
     coverage
diff --git a/src/pykeen/nn/__init__.py b/src/pykeen/nn/__init__.py
@@ -61,6 +61,8 @@
 )
 from .representation import (
     BackfillRepresentation,
+    BiomedicalCURIERepresentation,
+    CachedTextRepresentation,
     CombinedRepresentation,
     Embedding,
     LowRankRepresentation,
@@ -96,6 +98,7 @@
     "TextRepresentation",
     "TransformedRepresentation",
     "WikidataTextRepresentation",
+    "BiomedicalCURIERepresentation",
     "VisualRepresentation",
     "WikidataVisualRepresentation",
     "tokenizer_resolver",
@@ -154,4 +157,7 @@
 representation_resolver: ClassResolver[Representation] = ClassResolver.from_subclasses(
     base=Representation,
     default=Embedding,
+    skip={
+        CachedTextRepresentation,
+    },
 )
diff --git a/src/pykeen/nn/representation.py b/src/pykeen/nn/representation.py
@@ -10,7 +10,7 @@
 import string
 import warnings
 from abc import ABC, abstractmethod
-from typing import Any, Iterable, List, Mapping, Optional, Sequence, Tuple, Union
+from typing import Any, ClassVar, Iterable, List, Literal, Mapping, Optional, Sequence, Tuple, Type, Union, cast
 
 import more_itertools
 import numpy
@@ -26,7 +26,7 @@
 from .compositions import CompositionModule, composition_resolver
 from .init import initializer_resolver, uniform_norm_p1_
 from .text import TextEncoder, text_encoder_resolver
-from .utils import ShapeError, WikidataCache
+from .utils import PyOBOCache, ShapeError, TextCache, WikidataCache
 from .weighting import EdgeWeighting, SymmetricEdgeWeighting, edge_weight_resolver
 from ..datasets import Dataset
 from ..regularizers import Regularizer, regularizer_resolver
@@ -57,9 +57,11 @@
     "SubsetRepresentation",
     "CombinedRepresentation",
     "TensorTrainRepresentation",
-    "TextRepresentation",
     "TransformedRepresentation",
+    "TextRepresentation",
+    "CachedTextRepresentation",
     "WikidataTextRepresentation",
+    "BiomedicalCURIERepresentation",
     # Utils
     "constrainer_resolver",
     "normalizer_resolver",
@@ -952,6 +954,21 @@ def _plain_forward(
         return x
 
 
+def _clean_labels(labels: Sequence[Optional[str]], missing_action: Literal["error", "blank"]) -> Sequence[str]:
+    if missing_action == "error":
+        idx = [i for i, label in enumerate(labels) if label is None]
+        if idx:
+            raise ValueError(
+                f"The labels at the following indexes were none. "
+                f"Consider an alternate `missing_action` policy.\n{idx}",
+            )
+        return cast(Sequence[str], labels)
+    elif missing_action == "blank":
+        return [label or "" for label in labels]
+    else:
+        raise ValueError(f"Invalid `missing_action` policy: {missing_action}")
+
+
 class TextRepresentation(Representation):
     """
     Textual representations using a text encoder on labels.
@@ -969,7 +986,7 @@ class TextRepresentation(Representation):
 
         dataset = get_dataset(dataset="nations")
         entity_representations = TextRepresentation.from_dataset(
-            triples_factory=dataset,
+            dataset=dataset,
             encoder="transformer",
         )
         model = ERModel(
@@ -983,11 +1000,12 @@ class TextRepresentation(Representation):
 
     def __init__(
         self,
-        labels: Sequence[str],
+        labels: Sequence[Optional[str]],
         max_id: Optional[int] = None,
         shape: Optional[OneOrSequence[int]] = None,
         encoder: HintOrType[TextEncoder] = None,
         encoder_kwargs: OptionalKwargs = None,
+        missing_action: Literal["blank", "error"] = "error",
         **kwargs,
     ):
         """
@@ -1003,6 +1021,9 @@ def __init__(
             the text encoder, or a hint thereof
         :param encoder_kwargs:
             keyword-based parameters used to instantiate the text encoder
+        :param missing_action:
+            The policy for handling ``None`` entries in the given labels. If "error", raises an error
+            if any label is ``None``. If "blank", replaces each ``None`` with an empty string.
         :param kwargs:
             additional keyword-based parameters passed to :meth:`Representation.__init__`
 
@@ -1014,6 +1035,7 @@ def __init__(
         max_id = max_id or len(labels)
         if max_id != len(labels):
             raise ValueError(f"max_id={max_id} does not match len(labels)={len(labels)}")
+        labels = _clean_labels(labels, missing_action)
         # infer shape
         shape = ShapeError.verify(shape=encoder.encode_all(labels[0:1]).shape[1:], reference=shape)
         super().__init__(max_id=max_id, shape=shape, **kwargs)
@@ -1171,7 +1193,28 @@ def _plain_forward(
         return self.combine(combination=self.combination, base=self.base, indices=indices)
 
 
-class WikidataTextRepresentation(TextRepresentation):
+class CachedTextRepresentation(TextRepresentation):
+    """Textual representations for datasets with identifiers that can be looked up with a :class:`TextCache`."""
+
+    cache_cls: ClassVar[Type[TextCache]]
+
+    def __init__(self, identifiers: Sequence[str], **kwargs):
+        """
+        Initialize the representation.
+
+        :param identifiers:
+            the IDs to be resolved by the class, e.g., Wikidata IDs for :class:`WikidataTextRepresentation`, or
+            biomedical entities represented as compact URIs (CURIEs) for :class:`BiomedicalCURIERepresentation`
+        :param kwargs:
+            additional keyword-based parameters passed to :meth:`TextRepresentation.__init__`
+        """
+        cache = self.cache_cls()
+        labels = cache.get_texts(identifiers=identifiers)
+        # delegate to super class
+        super().__init__(labels=labels, **kwargs)
+
+
+class WikidataTextRepresentation(CachedTextRepresentation):
     """
     Textual representations for datasets grounded in Wikidata.
 
@@ -1202,24 +1245,50 @@ class WikidataTextRepresentation(TextRepresentation):
         )
     """
 
-    def __init__(self, labels: Sequence[str], **kwargs):
-        """
-        Initialize the representation.
+    cache_cls = WikidataCache
 
-        :param labels:
-            the wikidata IDs.
-        :param kwargs:
-            additional keyword-based parameters passed to :meth:`TextRepresentation.__init__`
-        """
-        # set up cache
-        cache = WikidataCache()
-        # get labels & descriptions
-        titles = cache.get_labels(ids=labels)
-        descriptions = cache.get_descriptions(ids=labels)
-        # compose labels
-        labels = [f"{title}: {description}" for title, description in zip(titles, descriptions)]
-        # delegate to super class
-        super().__init__(labels=labels, **kwargs)
+
+class BiomedicalCURIERepresentation(CachedTextRepresentation):
+    """
+    Textual representations for datasets grounded with biomedical CURIEs.
+
+    The label for each entity is obtained via :mod:`pyobo` using
+    :class:`pykeen.nn.utils.PyOBOCache` and encoded with :class:`TextRepresentation`.
+
+    Example usage:
+
+    .. code-block:: python
+
+        from pykeen.datasets import get_dataset
+        from pykeen.models import ERModel
+        from pykeen.nn import BiomedicalCURIERepresentation
+        from pykeen.pipeline import pipeline
+        import bioontologies
+
+        # Generate graph dataset from the Monarch Disease Ontology (MONDO)
+        graph = bioontologies.get_obograph_by_prefix("mondo").squeeze(standardize=True)
+        triples = (edge.as_tuple() for edge in graph.edges)
+        triples = [t for t in triples if all(t)]
+        triples = TriplesFactory.from_labeled_triples(np.array(triples))
+        dataset = Dataset.from_tf(triples)
+
+        entity_representations = BiomedicalCURIERepresentation.from_dataset(
+            dataset=dataset, encoder="transformer",
+        )
+        result = pipeline(
+            dataset=dataset,
+            model=ERModel,
+            model_kwargs=dict(
+                interaction="distmult",
+                entity_representations=entity_representations,
+                relation_representation_kwargs=dict(
+                    shape=entity_representations.shape,
+                ),
+            ),
+        )
+    """
+
+    cache_cls = PyOBOCache
 
 
 class PartitionRepresentation(Representation):
diff --git a/src/pykeen/nn/utils.py b/src/pykeen/nn/utils.py
diff --git a/tests/test_nn/test_representation.py b/tests/test_nn/test_representation.py
diff --git a/tox.ini b/tox.ini

-Original file line number
+Diff line change
 import logging
 import pathlib
 import re
 +import subprocess
 +from abc import ABC, abstractmethod
 from itertools import chain
 from textwrap import dedent
 from typing import Any, Callable, Collection, Dict, Iterable, List, Literal, Mapping, Optional, Sequence, Union, cast
     "safe_diagonal",
     "adjacency_tensor_to_stacked_matrix",
     "use_horizontal_stacking",
 -    "WikidataCache",
     "ShapeError",
 +    # Caches
 +    "TextCache",
 +    "WikidataCache",
 +    "PyOBOCache",
+]
 logger = logging.getLogger(__name__)
+]
 -class WikidataCache:
 +class TextCache(ABC):
 +    """An interface for looking up text for various flavors of entity identifiers."""
++
 +    @abstractmethod
 +    def get_texts(self, identifiers: Sequence[str]) -> Sequence[Optional[str]]:
 +        """Get text for the given identifiers for the cache."""
++
++
 +class WikidataCache(TextCache):
     """A cache for requests against Wikidata's SPARQL endpoint."""
     #: Wikidata SPARQL endpoint. See https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service#Interfacing
             assert isinstance(item, str)
         return cast(Sequence[str], result)
 -    def get_labels(self, ids: Sequence[str]) -> Sequence[str]:
 +    def get_texts(self, identifiers: Sequence[str]) -> Sequence[str]:
 +        """Get a concatenation of the title and description for each Wikidata identifier.
++
 +        :param identifiers:
 +            the Wikidata identifiers, each starting with Q (e.g., ``['Q42']``)
++
 +        :return:
 +            the label and description for each Wikidata entity concatenated
 +        """
 +        # get labels & descriptions
 +        titles = self.get_labels(wikidata_identifiers=identifiers)
 +        descriptions = self.get_descriptions(wikidata_identifiers=identifiers)
 +        # compose labels
 +        return [f"{title}: {description}" for title, description in zip(titles, descriptions)]
++
 +    def get_labels(self, wikidata_identifiers: Sequence[str]) -> Sequence[str]:
         """
         Get entity labels for the given IDs.
 -        :param ids:
 -            the Wikidata IDs
 +        :param wikidata_identifiers:
 +            the Wikidata identifiers, each starting with Q (e.g., ``['Q42']``)
         :return:
             the label for each Wikidata entity
         """
 -        return self._get(ids=ids, component="label")
 +        return self._get(ids=wikidata_identifiers, component="label")
 -    def get_descriptions(self, ids: Sequence[str]) -> Sequence[str]:
 +    def get_descriptions(self, wikidata_identifiers: Sequence[str]) -> Sequence[str]:
         """
         Get entity descriptions for the given IDs.
 -        :param ids:
 -            the Wikidata IDs
 +        :param wikidata_identifiers:
 +            the Wikidata identifiers, each starting with Q (e.g., ``['Q42']``)
         :return:
             the description for each Wikidata entity
         """
 -        return self._get(ids=ids, component="description")
 +        return self._get(ids=wikidata_identifiers, component="description")
     def _discover_images(self, extensions: Collection[str]) -> Mapping[str, pathlib.Path]:
         image_dir = self.module.join("images")
         num_missing = len(missing)
         logger.info(
             f"Downloading images for {num_missing:,} entities. With the rate limit in place, "
 -            f"this will take at least {num_missing/10:.2f} seconds.",
 +            f"this will take at least {num_missing / 10:.2f} seconds.",
+        )
         res_json = self.query(
             sparql=functools.partial(
         return [id_to_path.get(i) for i in ids]
 +PYOBO_PREFIXES_WARNED = set()
++
++
 +class PyOBOCache(TextCache):
 +    """A cache that looks up labels of biomedical entities based on their CURIEs."""
++
 +    def __init__(self, *args, **kwargs):
 +        """Instantiate the PyOBO cache, ensuring PyOBO is installed."""
 +        try:
 +            import pyobo
 +        except ImportError:
 +            raise ImportError(f"Can not use {self.__class__.__name__} because pyobo is not installed.")
 +        else:
 +            self._get_name = pyobo.get_name
 +        super().__init__(*args, **kwargs)
++
 +    def get_texts(self, identifiers: Sequence[str]) -> Sequence[Optional[str]]:
 +        """Get text for the given CURIEs.
++
 +        :param identifiers:
 +            The compact URIs for each entity (e.g., ``['doid:1234', ...]``)
++
 +        :return:
 +            the label for each entity, looked up via :func:`pyobo.get_name`.
 +            Might be none if no label is available.
 +        """
 +        # This import doesn't need a wrapper since it's a transitive
 +        # requirement of PyOBO
 +        import bioregistry
++
 +        res: List[Optional[str]] = []
 +        for curie in identifiers:
 +            try:
 +                prefix, identifier = curie.split(":", maxsplit=1)
 +            except ValueError:
 +                res.append(None)
 +                continue
++
 +            norm_prefix = bioregistry.normalize_prefix(prefix)
 +            if norm_prefix is None:
 +                if prefix not in PYOBO_PREFIXES_WARNED:
 +                    logger.warning("Prefix not registered in the Bioregistry: %s", prefix)
 +                    PYOBO_PREFIXES_WARNED.add(prefix)
 +                res.append(None)
 +                continue
++
 +            try:
 +                name = self._get_name(norm_prefix, identifier)
 +            except subprocess.CalledProcessError:
 +                if norm_prefix not in PYOBO_PREFIXES_WARNED:
 +                    logger.warning("could not get names from %s", norm_prefix)
 +                    PYOBO_PREFIXES_WARNED.add(norm_prefix)
 +                res.append(None)
 +                continue
 +            else:
 +                res.append(name)
 +        return res
++
++
 class ShapeError(ValueError):
     """An error for a mismatch in shapes."""
-Original file line number
+Diff line change
     cls = pykeen.nn.representation.WikidataTextRepresentation
     kwargs = dict(
 -        labels=["Q100", "Q1000"],
 +        identifiers=["Q100", "Q1000"],
         encoder="character-embedding",
+    )
         kwargs = super()._pre_instantiation_hook(kwargs)
         # the representation module infers the max_id from the provided labels
         kwargs.pop("max_id")
 -        self.max_id = len(kwargs["labels"])
 +        self.max_id = len(kwargs["identifiers"])
 +        return kwargs
++
++
 +@needs_packages("pyobo")
 +class BiomedicalCURIERepresentationTests(cases.RepresentationTestCase):
 +    """Tests for biomedical CURIE representations."""
++
 +    cls = pykeen.nn.representation.BiomedicalCURIERepresentation
 +    kwargs = dict(
 +        identifiers=[
 +            "hgnc:12929",  # PCGF2
 +            "hgnc:391",  # AKT1
 +        ],
 +        encoder="character-embedding",
 +    )
++
 +    # docstr-coverage: inherited
 +    def _pre_instantiation_hook(self, kwargs: MutableMapping[str, Any]) -> MutableMapping[str, Any]:  # noqa: D102
 +        kwargs = super()._pre_instantiation_hook(kwargs)
 +        # the representation module infers the max_id from the provided labels
 +        kwargs.pop("max_id")
 +        self.max_id = len(kwargs["identifiers"])
         return kwargs
     base_cls = pykeen.nn.representation.Representation
     base_test = cases.RepresentationTestCase
 -    skip_cls = {mocks.CustomRepresentation, pykeen.nn.pyg.MessagePassingRepresentation}
 +    skip_cls = {
 +        mocks.CustomRepresentation,
 +        pykeen.nn.pyg.MessagePassingRepresentation,
 +        pykeen.nn.CachedTextRepresentation,
 +    }