feldnerd · Feb 13, 2021
diff --git a/‎.bumpversion.cfg
+1-1 b/‎.bumpversion.cfg
+1-1
diff --git a/‎.github/workflows/tests.yml
+2 b/‎.github/workflows/tests.yml
+2
diff --git a/‎.github/workflows/tests_master.yml
+2 b/‎.github/workflows/tests_master.yml
+2
diff --git a/‎.gitignore
+1 b/‎.gitignore
+1
diff --git a/‎AUTHORS.md
+2 b/‎AUTHORS.md
+2
diff --git a/‎LICENSE
+1-1 b/‎LICENSE
+1-1
diff --git a/‎README.md
+1-1 b/‎README.md
+1-1
diff --git a/‎docs/source/conf.py
+1-1 b/‎docs/source/conf.py
+1-1
diff --git a/‎docs/source/reference/constants.rst
+3 b/‎docs/source/reference/constants.rst
+3
diff --git a/‎docs/source/tutorial/byod.rst
+97-121 b/‎docs/source/tutorial/byod.rst
+97-121
diff --git a/‎docs/source/tutorial/checkpoints.rst
+99-126 b/‎docs/source/tutorial/checkpoints.rst
+99-126
diff --git a/‎docs/source/tutorial/making_predictions.rst
+21-34 b/‎docs/source/tutorial/making_predictions.rst
+21-34
diff --git a/‎src/pykeen/datasets/__init__.py
+7-2 b/‎src/pykeen/datasets/__init__.py
+7-2
diff --git a/‎src/pykeen/datasets/base.py
+13-11 b/‎src/pykeen/datasets/base.py
+13-11
diff --git a/‎src/pykeen/datasets/dbpedia.py
+1-5 b/‎src/pykeen/datasets/dbpedia.py
+1-5
diff --git a/‎src/pykeen/pipeline.py
+7-1 b/‎src/pykeen/pipeline.py
+7-1
diff --git a/‎src/pykeen/templates/README.md
+1-1 b/‎src/pykeen/templates/README.md
+1-1
diff --git a/‎src/pykeen/typing.py
+21-11 b/‎src/pykeen/typing.py
+21-11
diff --git a/‎src/pykeen/version.py
+1-1 b/‎src/pykeen/version.py
+1-1
diff --git a/‎tox.ini
+10 b/‎tox.ini
+10
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.2.0-dev
+current_version = 1.3.0-dev
 commit = True
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(?:-(?P<release>[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?(?:\+(?P<build>[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?
 
@@ -85,6 +85,8 @@ jobs:
         run: tox -e py
       - name: Run slow tests
         run: tox -e integration
+      - name: Run doctests
+        run: tox -e doctests
   windows:
     if: "contains(github.event.head_commit.message, 'Trigger CI')"
     name: Windows
 
@@ -84,6 +84,8 @@ jobs:
         run: tox -e py
       - name: Run slow tests
         run: tox -e integration
+      - name: Run doctests
+        run: tox -e doctests
   windows:
     if: "!contains(github.event.head_commit.message, 'skip ci')"
     name: Windows
 
@@ -117,3 +117,4 @@ docs/source/api/*
 scratch/*
 wandb/*
 mlruns
+doctests/
@@ -16,3 +16,5 @@
 - [Michael Galkin](https://github.com/migalkin)
 - [Felix Hamann](https://github.com/kantholtz)
 - [Sankranti Joshi](https://github.com/sunny1401)
+
+See also: https://github.com/pykeen/pykeen/graphs/contributors
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2019-2020 PyKEEN Project Team
+Copyright (c) 2019-2021 PyKEEN Project Team
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 
@@ -300,7 +300,7 @@ See [CONTRIBUTING.md](/CONTRIBUTING.md) for more information on getting involved
 This project has been supported by several organizations (in alphabetical order):
 
 - [Bayer](https://www.bayer.com/)
-- [Enveda Therapeutics](https://envedatherapeutics.com/)
+- [Enveda Biosciences](https://www.envedabio.com/)
 - [Fraunhofer Institute for Algorithms and Scientific Computing](https://www.scai.fraunhofer.de)
 - [Fraunhofer Institute for Intelligent Analysis and Information Systems](https://www.iais.fraunhofer.de)
 - [Fraunhofer Center for Machine Learning](https://www.cit.fraunhofer.de/de/zentren/maschinelles-lernen.html)
 
@@ -52,7 +52,7 @@
 author = 'PyKEEN Project Team'
 
 # The full version, including alpha/beta/rc tags.
-release = '1.2.0-dev'
+release = '1.3.0-dev'
 
 # The short X.Y version.
 parsed_version = re.match(
 
@@ -2,3 +2,6 @@ Constants
 =========
 .. automodule:: pykeen.constants
     :members:
+
+.. automodule:: pykeen.typing
+    :members:
@@ -1,28 +1,26 @@
 Bring Your Own Data
 ===================
 As an alternative to using a pre-packaged dataset, the training and testing can be set explicitly
-by file path or with instances of :class:`pykeen.triples.TriplesFactory`.
+by file path or with instances of :class:`pykeen.triples.TriplesFactory`. Throughout this
+tutorial, the paths to the training, testing, and validation sets for the built-in
+:class:`pykeen.datasets.Nations` dataset will be used as examples.
 
 Pre-stratified Dataset
 ----------------------
 You've got a training and testing file as 3-column TSV files, all ready to go. You're sure that there aren't
 any entities or relations appearing in the testing set that don't appear in the training set. Load them in the
 pipeline like this:
 
-.. code-block:: python
-
-    from pykeen.triples import TriplesFactory
-    from pykeen.pipeline import pipeline
-
-    training_path: str = ...
-    testing_path: str = ...
-
-    result = pipeline(
-        training_triples_factory=training_path,
-        testing_triples_factory=testing_path,
-        model='TransE',
-    )
-    result.save_to_directory('test_pre_stratified_transe')
+>>> from pykeen.triples import TriplesFactory
+>>> from pykeen.pipeline import pipeline
+>>> from pykeen.datasets.nations import NATIONS_TRAIN_PATH, NATIONS_TEST_PATH
+>>> result = pipeline(
+...     training=NATIONS_TRAIN_PATH,
+...     testing=NATIONS_TEST_PATH,
+...     model='TransE',
+...     training_kwargs=dict(num_epochs=5),  # short epochs for testing - you should go higher
+... )
+>>> result.save_to_directory('doctests/test_pre_stratified_transe')
 
 PyKEEN will take care of making sure that the entities are mapped from their labels to appropriate integer
 (technically, 0-dimensional :class:`torch.LongTensor`) indexes and that the different sets of triples
@@ -31,68 +29,54 @@ share the same mapping.
 This is equally applicable for the :func:`pykeen.hpo.hpo_pipeline`, which has a similar interface to
 the :func:`pykeen.pipeline.pipeline` as in:
 
-.. code-block:: python
-
-    from pykeen.triples import TriplesFactory
-    from pykeen.hpo import hpo_pipeline
-
-    training_path: str = ...
-    testing_path: str = ...
-
-    result = hpo_pipeline(
-        n_trials=30,
-        training_triples_factory=training_path,
-        testing_triples_factory=testing_path,
-        model='TransE',
-    )
-    result.save_to_directory('test_hpo_pre_stratified_transe')
+>>> from pykeen.hpo import hpo_pipeline
+>>> from pykeen.datasets.nations import NATIONS_TRAIN_PATH, NATIONS_TEST_PATH, NATIONS_VALIDATE_PATH
+>>> result = hpo_pipeline(
+...     n_trials=3,  # you probably want more than this
+...     training=NATIONS_TRAIN_PATH,
+...     testing=NATIONS_TEST_PATH,
+...     validation=NATIONS_VALIDATE_PATH,
+...     model='TransE',
+...     training_kwargs=dict(num_epochs=5),  # short epochs for testing - you should go higher
+... )
+>>> result.save_to_directory('doctests/test_hpo_pre_stratified_transe')
 
 The remainder of the examples will be for :func:`pykeen.pipeline.pipeline`, but all work exactly the same
 for :func:`pykeen.hpo.hpo_pipeline`.
 
 If you want to add dataset-wide arguments, you can use the ``dataset_kwargs`` argument
 to the :class:`pykeen.pipeline.pipeline` to enable options like ``create_inverse_triples=True``.
 
-.. code-block:: python
-
-    from pykeen.triples import TriplesFactory
-    from pykeen.pipeline import pipeline
-
-    training_path: str = ...
-    testing_path: str = ...
-
-    result = pipeline(
-        training_triples_factory=training_path,
-        testing_triples_factory=testing_path,
-        dataset_kwargs={'create_inverse_triples': True},
-        model='TransE',
-    )
-    result.save_to_directory('test_pre_stratified_transe')
+>>> from pykeen.pipeline import pipeline
+>>> from pykeen.datasets.nations import NATIONS_TRAIN_PATH, NATIONS_TEST_PATH
+>>> result = pipeline(
+...     training=NATIONS_TRAIN_PATH,
+...     testing=NATIONS_TEST_PATH,
+...     dataset_kwargs={'create_inverse_triples': True},
+...     model='TransE',
+...     training_kwargs=dict(num_epochs=5),  # short epochs for testing - you should go higher
+... )
+>>> result.save_to_directory('doctests/test_pre_stratified_transe')
 
 If you want finer control over how the triples are created, for example, if they are not all coming from
 TSV files, you can use the :class:`pykeen.triples.TriplesFactory` interface.
 
-.. code-block:: python
-
-    from pykeen.triples import TriplesFactory
-    from pykeen.pipeline import pipeline
-
-    training_path: str = ...
-    testing_path: str = ...
-
-    training = TriplesFactory(path=training_path)
-    testing = TriplesFactory(
-        path=testing_path,
-        entity_to_id=training.entity_to_id,
-        relation_to_id=training.relation_to_id,
-    )
-
-    result = pipeline(
-        training_triples_factory=training,
-        testing_triples_factory=testing,
-        model='TransE',
-    )
-    pipeline_result.save_to_directory('test_pre_stratified_transe')
+>>> from pykeen.triples import TriplesFactory
+>>> from pykeen.pipeline import pipeline
+>>> from pykeen.datasets.nations import NATIONS_TRAIN_PATH, NATIONS_TEST_PATH
+>>> training = TriplesFactory.from_path(NATIONS_TRAIN_PATH)
+>>> testing = TriplesFactory.from_path(
+...     NATIONS_TEST_PATH,
+...     entity_to_id=training.entity_to_id,
+...     relation_to_id=training.relation_to_id,
+... )
+>>> result = pipeline(
+...     training=training,
+...     testing=testing,
+...     model='TransE',
+...     training_kwargs=dict(num_epochs=5),  # short epochs for testing - you should go higher
+... )
+>>> result.save_to_directory('doctests/test_pre_stratified_transe')
 
 .. warning::
 
@@ -106,31 +90,26 @@ The ``dataset_kwargs`` argument is ignored when passing your own :class:`pykeen.
 sure to include the ``create_inverse_triples=True`` in the instantiation of those classes if that's your
 desired behavior as in:
 
-.. code-block:: python
-
-    from pykeen.triples import TriplesFactory
-    from pykeen.pipeline import pipeline
-
-    training_path: str = ...
-    testing_path: str = ...
-
-    training = TriplesFactory(
-        path=training_path,
-        create_inverse_triples=True,
-    )
-    testing = TriplesFactory(
-        path=testing_path,
-        entity_to_id=training.entity_to_id,
-        relation_to_id=training.relation_to_id,
-        create_inverse_triples=True,
-    )
-
-    result = pipeline(
-        training_triples_factory=training,
-        testing_triples_factory=testing,
-        model='TransE',
-    )
-    result.save_to_directory('test_pre_stratified_transe')
+>>> from pykeen.triples import TriplesFactory
+>>> from pykeen.pipeline import pipeline
+>>> from pykeen.datasets.nations import NATIONS_TRAIN_PATH, NATIONS_TEST_PATH
+>>> training = TriplesFactory.from_path(
+...     NATIONS_TRAIN_PATH,
+...     create_inverse_triples=True,
+... )
+>>> testing = TriplesFactory.from_path(
+...     NATIONS_TEST_PATH,
+...     entity_to_id=training.entity_to_id,
+...     relation_to_id=training.relation_to_id,
+...     create_inverse_triples=True,
+... )
+>>> result = pipeline(
+...     training=training,
+...     testing=testing,
+...     model='TransE',
+...     training_kwargs=dict(num_epochs=5),  # short epochs for testing - you should go higher
+... )
+>>> result.save_to_directory('doctests/test_pre_stratified_transe')
 
 Triples factories can also be instantiated using the ``triples`` keyword argument instead of the ``path`` argument
 if you already have triples loaded in a :class:`numpy.ndarray`.
@@ -141,37 +120,34 @@ It's more realistic your real-world dataset is not already stratified into train
 PyKEEN has you covered with :func:`pykeen.triples.TriplesFactory.split`, which will allow you to create
 a stratified dataset.
 
-.. code-block:: python
-
-    from pykeen.triples import TriplesFactory
-    from pykeen.pipeline import pipeline
-
-    tf = TriplesFactory(path=...)
-    training, testing = tf.split()
-
-    result = pipeline(
-        training_triples_factory=training,
-        testing_triples_factory=testing,
-        model='TransE',
-    )
-    pipeline_result.save_to_directory('test_unstratified_transe')
+>>> from pykeen.triples import TriplesFactory
+>>> from pykeen.pipeline import pipeline
+>>> from pykeen.datasets.nations import NATIONS_TRAIN_PATH
+>>> tf = TriplesFactory.from_path(NATIONS_TRAIN_PATH)
+>>> training, testing = tf.split()
+>>> result = pipeline(
+...     training=training,
+...     testing=testing,
+...     model='TransE',
+...     training_kwargs=dict(num_epochs=5),  # short epochs for testing - you should go higher
+... )
+>>> result.save_to_directory('doctests/test_unstratified_transe')
 
 By default, this is an 80/20 split. If you want to use early stopping, you'll also need a validation set, so
 you should specify the splits:
 
-.. code-block:: python
-
-    from pykeen.triples import TriplesFactory
-    from pykeen.pipeline import pipeline
-
-    tf = TriplesFactory(path=...)
-    training, testing, validation = tf.split([.8, .1, .1])
-
-    result = pipeline(
-        training_triples_factory=training,
-        testing_triples_factory=testing,
-        validation_triples_factory=validation,
-        model='TransE',
-        stopper='early',
-    )
-    pipeline_result.save_to_directory('test_unstratified_stopped_transe')
+>>> from pykeen.triples import TriplesFactory
+>>> from pykeen.pipeline import pipeline
+>>> from pykeen.datasets.nations import NATIONS_TRAIN_PATH
+>>> tf = TriplesFactory.from_path(NATIONS_TRAIN_PATH)
+>>> training, testing, validation = tf.split([.8, .1, .1])
+>>> result = pipeline(
+...     training=training,
+...     testing=testing,
+...     validation=validation,
+...     model='TransE',
+...     stopper='early',
+...     training_kwargs=dict(num_epochs=5),  # short epochs for testing - you should go
+...                                          # higher, especially with early stopper enabled
+... )
+>>> result.save_to_directory('doctests/test_unstratified_stopped_transe')
@@ -17,55 +17,46 @@ Regular Checkpoints
 The tutorial :ref:`first_steps` showed how the :func:`pykeen.pipeline.pipeline` function can be used to set up an entire
 KGEM for training and evaluation in just two lines of code. A slightly extended example is shown below:
 
-.. code-block:: python
-
-    from pykeen.pipeline import pipeline
-
-    pipeline_result = pipeline(
-        dataset='Nations',
-        model='TransE',
-        optimizer='Adam',
-        training_kwargs=dict(
-            num_epochs=1000,
-        ),
-    )
+>>> from pykeen.pipeline import pipeline
+>>> pipeline_result = pipeline(
+...     dataset='Nations',
+...     model='TransE',
+...     optimizer='Adam',
+...     training_kwargs=dict(
+...         num_epochs=1000,
+...     ),
+... )
 
 To enable checkpoints, all you have to do is add a ``checkpoint_name`` argument to the ``training_kwargs``.
 This argument should have the name you would like the checkpoint files saved on your computer to be called.
 
-.. code-block:: python
-
-    from pykeen.pipeline import pipeline
-
-    pipeline_result = pipeline(
-        dataset='Nations',
-        model='TransE',
-        optimizer='Adam',
-        training_kwargs=dict(
-            num_epochs=1000,
-            checkpoint_name='my_checkpoint.pt',
-        ),
-    )
+>>> from pykeen.pipeline import pipeline
+>>> pipeline_result = pipeline(
+...     dataset='Nations',
+...     model='TransE',
+...     optimizer='Adam',
+...     training_kwargs=dict(
+...         num_epochs=1000,
+...         checkpoint_name='my_checkpoint.pt',
+...     ),
+... )
 
 Furthermore, you can set the checkpoint frequency, i.e. how often checkpoints should be saved given in minutes, by
 setting the argument ``checkpoint_frequency`` with an integer. The default frequency is 30 minutes and setting it to
 ``0`` will cause the training loop to save a checkpoint after each epoch.
 Let's look at an example.
 
-.. code-block:: python
-
-    from pykeen.pipeline import pipeline
-
-    pipeline_result = pipeline(
-        dataset='Nations',
-        model='TransE',
-        optimizer='Adam',
-        training_kwargs=dict(
-            num_epochs=1000,
-            checkpoint_name='my_checkpoint.pt',
-            checkpoint_frequency=5,
-        ),
-    )
+>>> from pykeen.pipeline import pipeline
+>>> pipeline_result = pipeline(
+...     dataset='Nations',
+...     model='TransE',
+...     optimizer='Adam',
+...     training_kwargs=dict(
+...         num_epochs=1000,
+...         checkpoint_name='my_checkpoint.pt',
+...         checkpoint_frequency=5,
+...     ),
+... )
 
 Here we have defined a pipeline that will save training loop checkpoints in the checkpoint file called
 ``my_checkpoint.pt`` every time an epoch finishes and at least `5` minutes have passed since saving previously.
@@ -78,20 +69,17 @@ or the early stopper stops it. Assuming that you successfully trained the KGEM a
 that you would like to test the model with `2000` epochs, all you have to do is to change the number of epochs and
 execute the code like:
 
-.. code-block:: python
-
-    from pykeen.pipeline import pipeline
-
-    pipeline_result = pipeline(
-        dataset='Nations',
-        model='TransE',
-        optimizer='Adam',
-        training_kwargs=dict(
-            num_epochs=2000,  # more epochs than before
-            checkpoint_name='my_checkpoint.pt',
-            checkpoint_frequency=5,
-        ),
-    )
+>>> from pykeen.pipeline import pipeline
+>>> pipeline_result = pipeline(
+...     dataset='Nations',
+...     model='TransE',
+...     optimizer='Adam',
+...     training_kwargs=dict(
+...         num_epochs=2000,  # more epochs than before
+...         checkpoint_name='my_checkpoint.pt',
+...         checkpoint_frequency=5,
+...     ),
+... )
 
 The above code will load the saved state after finishing `1000` epochs and continue to train to `2000` epochs, giving
 the exact same results as if you would have run it for `2000` epochs in the first place.
@@ -101,20 +89,17 @@ which is a subdirectory in your home directory, e.g. ``~/.data/pykeen/checkpoint
 Optionally, you can set the path to where you want the checkpoints to be saved by setting the ``checkpoint_directory``
 argument with a string or a :class:`pathlib.Path` object containing your desired root path, as shown in this example:
 
-.. code-block:: python
-
-    from pykeen.pipeline import pipeline
-
-    pipeline_result = pipeline(
-        dataset='Nations',
-        model='TransE',
-        optimizer='Adam',
-        training_kwargs=dict(
-            num_epochs=2000,
-            checkpoint_name='my_checkpoint.pt',
-            checkpoint_directory='/my/secret/dir',
-        ),
-    )
+>>> from pykeen.pipeline import pipeline
+>>> pipeline_result = pipeline(
+...     dataset='Nations',
+...     model='TransE',
+...     optimizer='Adam',
+...     training_kwargs=dict(
+...         num_epochs=2000,
+...         checkpoint_name='my_checkpoint.pt',
+...         checkpoint_directory='doctests/checkpoint_dir',
+...     ),
+... )
 
 .. _failure_checkpoints_how_to:
 
@@ -123,16 +108,16 @@ Checkpoints on Failure
 In cases where you only would like to save checkpoints whenever the training loop might fail, you can use the argument
 ``checkpoint_on_failure=True``, like:
 
-.. code-block:: python
-
-    from pykeen.pipeline import pipeline
-
-    pipeline_result = pipeline(
-        dataset='Nations',
-        model='TransE',
-        optimizer='Adam',
-        training_kwargs=dict(num_epochs=2000, checkpoint_on_failure=True),
-    )
+>>> from pykeen.pipeline import pipeline
+>>> pipeline_result = pipeline(
+...     dataset='Nations',
+...     model='TransE',
+...     optimizer='Adam',
+...     training_kwargs=dict(
+...         num_epochs=2000,
+...         checkpoint_on_failure=True,
+...     ),
+... )
 
 This option differs from regular checkpoints, since regular checkpoints are only saved
 after a successful epoch. When saving checkpoints due to failure of the training loop there is no guarantee that all
@@ -141,19 +126,17 @@ specific training loop. Therefore, these checkpoints are saved with a distinct c
 ``PyKEEN_just_saved_my_day_{datetime}.pt`` in the given ``checkpoint_directory``, even when you also opted to use
 regular checkpoints as defined above, e.g. with this code:
 
-.. code-block:: python
-
-    from pykeen.pipeline import pipeline
-    pipeline_result = pipeline(
-        dataset='Nations',
-        model='TransE',
-        optimizer='Adam',
-        training_kwargs=dict(
-            num_epochs=2000,
-            checkpoint_name='my_checkpoint.pt',
-            checkpoint_on_failure=True,
-        ),
-    )
+>>> from pykeen.pipeline import pipeline
+>>> pipeline_result = pipeline(
+...     dataset='Nations',
+...     model='TransE',
+...     optimizer='Adam',
+...     training_kwargs=dict(
+...         num_epochs=2000,
+...         checkpoint_name='my_checkpoint.pt',
+...         checkpoint_on_failure=True,
+...     ),
+... )
 
 Note: Use this argument with caution, since every failed training loop will create a distinct checkpoint file.
 
@@ -193,21 +176,17 @@ the same compared to running uninterrupted without checkpoints, also for the eva
 
 To show how to use the checkpoint functionality without the pipeline, we define a KGEM first:
 
-.. code-block:: python
-
-    from pykeen.models import TransE
-    from pykeen.training import SLCWATrainingLoop
-    from pykeen.triples import TriplesFactory
-    from torch.optim import Adam
-
-    triples_factory = Nations().training
-    model = TransE(
-        triples_factory=triples_factory,
-        random_seed=123,
-    )
-
-    optimizer = Adam(params=model.get_grad_params())
-    training_loop = SLCWATrainingLoop(model=model, optimizer=optimizer)
+>>> from pykeen.models import TransE
+>>> from pykeen.training import SLCWATrainingLoop
+>>> from pykeen.triples import TriplesFactory
+>>> from torch.optim import Adam
+>>> from pykeen.datasets import Nations
+>>> triples_factory = Nations().training
+>>> model = TransE(
+...     triples_factory=triples_factory,
+...     random_seed=123,
+... )
+>>> optimizer = Adam(params=model.get_grad_params())
+>>> training_loop = SLCWATrainingLoop(model=model, optimizer=optimizer)
 
 At this point we have a model, dataset and optimizer all setup in a training loop and are ready to train the model with
 the ``training_loop``'s method :func:`pykeen.training.TrainingLoop.train`. To enable checkpoints all you have to do is
@@ -222,13 +201,11 @@ argument with a string or a :class:`pathlib.Path` object containing your desired
 
 Here is an example:
 
-.. code-block:: python
-
-    losses = training_loop.train(
-        num_epochs=1000,
-        checkpoint_name='my_checkpoint.pt',
-        checkpoint_frequency=5,
-    )
+>>> losses = training_loop.train(
+...     num_epochs=1000,
+...     checkpoint_name='my_checkpoint.pt',
+...     checkpoint_frequency=5,
+... )
 
 With this code we have started the training loop with the above defined KGEM. The training loop will save a checkpoint
 in the ``my_checkpoint.pt`` file, which will be saved in the ``~/.data/pykeen/checkpoints/`` directory, since we haven't
@@ -249,26 +226,22 @@ E.g. the above training loop finished successfully after 1000 epochs, but you wo
 train the same model from that state for 2000 epochs. All you have to do is to change the argument
 ``num_epochs`` in the above code to:
 
-.. code-block:: python
-
-    losses = training_loop.train(
-        num_epochs=2000,
-        checkpoint_name='my_checkpoint.pt',
-        checkpoint_frequency=5,
-    )
+>>> losses = training_loop.train(
+...     num_epochs=2000,
+...     checkpoint_name='my_checkpoint.pt',
+...     checkpoint_frequency=5,
+... )
 
 and now the training loop will resume from the state at 1000 epochs and continue to train until 2000 epochs.
 
 As shown in :ref:`failure_checkpoints_how_to`, you can also save checkpoints only in cases where the
 training loop fails. To do this you just have to set the argument `checkpoint_on_failure=True`, like:
 
-.. code-block:: python
-
-    losses = training_loop.train(
-        num_epochs=2000,
-        checkpoint_directory='/my/secret/dir',
-        checkpoint_on_failure=True,
-    )
+>>> losses = training_loop.train(
+...     num_epochs=2000,
+...     checkpoint_directory='/my/secret/dir',
+...     checkpoint_on_failure=True,
+... )
 
 This code will save a checkpoint in case the training loop fails. Note how we also chose a new checkpoint directory by
 setting the `checkpoint_directory` argument to ``/my/secret/dir``.
@@ -26,30 +26,22 @@ This example shows using the :func:`pykeen.pipeline.pipeline` to train a model
 which will already be in memory. Each of the high-level interfaces are exposed through the
 model:
 
-.. code-block:: python
-
-    from pykeen.pipeline import pipeline
-
-    pipeline_result = pipeline(dataset='Nations', model='RotatE')
-    model = pipeline_result.model
-
-    # Predict tails
-    predicted_tails_df = model.get_tail_prediction_df('brazil', 'intergovorgs')
-
-    # Predict relations
-    predicted_relations_df = model.get_relation_prediction_df('brazil', 'uk')
-
-    # Predict heads
-    predicted_heads_df = model.get_head_prediction_df('conferences', 'brazil')
-
-    # Score all triples (memory intensive)
-    predictions_df = model.get_all_prediction_df()
-
-    # Score top K triples
-    predictions_df = model.get_all_prediction_df(k=150)
-
-    # save the model
-    pipeline_result.save_to_directory('nations_rotate')
+>>> from pykeen.pipeline import pipeline
+>>> # Run the pipeline
+>>> pipeline_result = pipeline(dataset='Nations', model='RotatE')
+>>> model = pipeline_result.model
+>>> # Predict tails
+>>> predicted_tails_df = model.get_tail_prediction_df('brazil', 'intergovorgs')
+>>> # Predict relations
+>>> predicted_relations_df = model.get_relation_prediction_df('brazil', 'uk')
+>>> # Predict heads
+>>> predicted_heads_df = model.get_head_prediction_df('conferences', 'brazil')
+>>> # Score all triples (memory intensive)
+>>> predictions_df = model.get_all_prediction_df()
+>>> # Score top K triples
+>>> top_k_predictions_df = model.get_all_prediction_df(k=150)
+>>> # save the model
+>>> pipeline_result.save_to_directory('doctests/nations_rotate')
 
 Loading a Model
 ~~~~~~~~~~~~~~~
@@ -58,16 +50,11 @@ This example shows how to reload a previously trained model. The
 a file named ``trained_model.pkl``, so we will use the one from the
 previous example.
 
-.. code-block:: python
-
-    import torch
-
-    model = torch.load('nations_rotate/trained_model.pkl')
-
-    # Predict tails
-    predicted_tails_df = model.get_tail_prediction_df('brazil', 'intergovorgs')
-
-    # everything else is the same as above
+>>> import torch
+>>> model = torch.load('doctests/nations_rotate/trained_model.pkl')
+>>> # Predict tails
+>>> predicted_tails_df = model.get_tail_prediction_df('brazil', 'intergovorgs')
+>>> # everything else is the same as above
 
 There's an example model available at
 https://github.com/pykeen/pykeen/blob/master/notebooks/hello_world/nations_transe/trained_model.pkl
 
@@ -125,7 +125,7 @@ def get_dataset(
         raise TypeError(f'Dataset is invalid type: {type(dataset)}')
 
     if isinstance(training, str) and isinstance(testing, str):
-        if isinstance(validation, str):
+        if validation is None or isinstance(validation, str):
             return PathDataset(
                 training_path=training,
                 testing_path=testing,
@@ -146,7 +146,12 @@ def get_dataset(
             validation=validation,
         )
 
-    raise TypeError('Training and testing must both be given as strings or Triples Factories')
+    raise TypeError(
+        f'''Training and testing must both be given as strings or Triples Factories.
+        - Training: {type(training)}: {training}
+        - Testing: {type(testing)}: {testing}
+        ''',
+    )
 
 
 def has_dataset(key: str) -> bool:
 
@@ -177,13 +177,12 @@ def testing(self) -> TriplesFactory:  # type:ignore # noqa: D401
         return self._testing
 
     @property
-    def validation(self) -> TriplesFactory:  # type:ignore # noqa: D401
+    def validation(self) -> Optional[TriplesFactory]:  # type:ignore # noqa: D401
         """The validation triples factory that shares indices with the training triples factory."""
         if not self._loaded:
             self._load()
         if not self._loaded_validation:
             self._load_validation()
-        assert self._validation is not None
         return self._validation
 
     @property
@@ -224,7 +223,7 @@ def __init__(
         self,
         training_path: Union[str, TextIO],
         testing_path: Union[str, TextIO],
-        validation_path: Union[str, TextIO],
+        validation_path: Union[None, str, TextIO],
         eager: bool = False,
         create_inverse_triples: bool = False,
         load_triples_kwargs: Optional[Mapping[str, Any]] = None,
@@ -269,14 +268,17 @@ def _load_validation(self) -> None:
         # don't call this function by itself. assumes called through the `validation`
         # property and the _training factory has already been loaded
         assert self._training is not None
-        self._validation = TriplesFactory.from_path(
-            path=self.validation_path,
-            entity_to_id=self._training.entity_to_id,  # share entity index with training
-            relation_to_id=self._training.relation_to_id,  # share relation index with training
-            # do not explicitly create inverse triples for testing; this is handled by the evaluation code
-            create_inverse_triples=False,
-            load_triples_kwargs=self.load_triples_kwargs,
-        )
+        if self.validation_path is None:
+            self._validation = None
+        else:
+            self._validation = TriplesFactory.from_path(
+                path=self.validation_path,
+                entity_to_id=self._training.entity_to_id,  # share entity index with training
+                relation_to_id=self._training.relation_to_id,  # share relation index with training
+                # do not explicitly create inverse triples for validation; this is handled by the evaluation code
+                create_inverse_triples=False,
+                load_triples_kwargs=self.load_triples_kwargs,
+            )
 
     def __repr__(self) -> str:  # noqa: D105
         return (
 
@@ -45,8 +45,4 @@ def __init__(self, create_inverse_triples: bool = False, **kwargs):
 
 
 if __name__ == '__main__':
-    _d = DBpedia50()
-    _d.summarize()
-    print(_d.training.triples[:5])
-    print(_d.testing.triples[:5])
-    print(_d.validation.triples[:5])
+    DBpedia50().summarize()
@@ -174,6 +174,7 @@
 import pickle
 import time
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import Any, Collection, Dict, Iterable, List, Mapping, Optional, Set, Type, Union
 
 import pandas as pd
@@ -423,7 +424,12 @@ def _get_results(self) -> Mapping[str, Any]:
             results['stopper'] = self.stopper.get_summary_dict()
         return results
 
-    def save_to_directory(self, directory: str, save_metadata: bool = True, save_replicates: bool = True) -> None:
+    def save_to_directory(
+        self,
+        directory: Union[str, Path],
+        save_metadata: bool = True,
+        save_replicates: bool = True,
+    ) -> None:
         """Save all artifacts in the given directory."""
         os.makedirs(directory, exist_ok=True)
 
 
@@ -202,7 +202,7 @@ See [CONTRIBUTING.md](/CONTRIBUTING.md) for more information on getting involved
 This project has been supported by several organizations (in alphabetical order):
 
 - [Bayer](https://www.bayer.com/)
-- [Enveda Therapeutics](https://envedatherapeutics.com/)
+- [Enveda Biosciences](https://www.envedabio.com/)
 - [Fraunhofer Institute for Algorithms and Scientific Computing](https://www.scai.fraunhofer.de)
 - [Fraunhofer Institute for Intelligent Analysis and Information Systems](https://www.iais.fraunhofer.de)
 - [Fraunhofer Center for Machine Learning](https://www.cit.fraunhofer.de/de/zentren/maschinelles-lernen.html)
 
@@ -12,18 +12,20 @@
     'Hint',
     'Mutation',
     'OneOrSequence',
-    # Others
+    # Triples
     'LabeledTriples',
     'MappedTriples',
     'EntityMapping',
     'RelationMapping',
+    # Others
+    'DeviceHint',
+    'TorchRandomHint',
+    # Tensor Functions
     'Initializer',
     'Normalizer',
     'Constrainer',
     'cast_constrainer',
-    'InteractionFunction',
-    'DeviceHint',
-    'TorchRandomHint',
+    # Tensors
     'HeadRepresentation',
     'RelationRepresentation',
     'TailRepresentation',
@@ -34,6 +36,7 @@
 
 X = TypeVar('X')
 Hint = Union[None, str, X]
+#: A function that mutates the input and returns a new object of the same type as output
 Mutation = Callable[[X], X]
 OneOrSequence = Union[X, Sequence[X]]
 
@@ -42,25 +45,32 @@
 EntityMapping = Mapping[str, int]
 RelationMapping = Mapping[str, int]
 
-# comment: TypeVar expects none, or at least two super-classes
-TensorType = TypeVar("TensorType", torch.Tensor, torch.FloatTensor)
-InteractionFunction = Callable[[TensorType, TensorType, TensorType], TensorType]
-
-Initializer = Mutation[TensorType]
-Normalizer = Mutation[TensorType]
-Constrainer = Mutation[TensorType]
+#: A function that can be applied to a tensor to initialize it
+Initializer = Mutation[torch.FloatTensor]
+#: A function that can be applied to a tensor to normalize it
+Normalizer = Mutation[torch.FloatTensor]
+#: A function that can be applied to a tensor to constrain it
+Constrainer = Mutation[torch.FloatTensor]
 
 
 def cast_constrainer(f) -> Constrainer:
     """Cast a constrainer function with :func:`typing.cast`."""
     return cast(Constrainer, f)
 
 
+#: A hint for a :class:`torch.device`
 DeviceHint = Hint[torch.device]
+#: A hint for a :class:`torch.Generator`
 TorchRandomHint = Hint[torch.Generator]
 
+#: A type variable for head representations used in :class:`pykeen.models.Model`,
+#: :class:`pykeen.nn.modules.Interaction`, etc.
 HeadRepresentation = TypeVar("HeadRepresentation", bound=OneOrSequence[torch.FloatTensor])
+#: A type variable for relation representations used in :class:`pykeen.models.Model`,
+#: :class:`pykeen.nn.modules.Interaction`, etc.
 RelationRepresentation = TypeVar("RelationRepresentation", bound=OneOrSequence[torch.FloatTensor])
+#: A type variable for tail representations used in :class:`pykeen.models.Model`,
+#: :class:`pykeen.nn.modules.Interaction`, etc.
 TailRepresentation = TypeVar("TailRepresentation", bound=OneOrSequence[torch.FloatTensor])
 
 
 
@@ -11,7 +11,7 @@
     'get_git_hash',
 ]
 
-VERSION = '1.2.0-dev'
+VERSION = '1.3.0-dev'
 
 
 def get_git_hash() -> str:
 
@@ -17,6 +17,7 @@ envlist =
     doc8
     docs
     # the actual tests
+    doctests
     py
     integration
     # always keep coverage-report last
@@ -49,6 +50,15 @@ deps =
 extras =
     mlflow
 
+[testenv:doctests]
+commands =
+    # TODO make this automatic for all RST in a loop (but not using xargs since doctest uses multiprocessing)
+    python -m doctest docs/source/tutorial/first_steps.rst
+    python -m doctest docs/source/tutorial/byod.rst
+    python -m doctest docs/source/tutorial/making_predictions.rst
+    # python -m doctest src/pykeen/pipeline.py
+    # python -m doctest src/pykeen/hpo/__init__.py
+
 [testenv:coverage-clean]
 deps = coverage
 skip_install = true
Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@`
`11`	`11`	`'get_git_hash',`
`12`	`12`	`]`
`13`	`13`
`14`		`-VERSION = '1.2.0-dev'`
	`14`	`+VERSION = '1.3.0-dev'`
`15`	`15`
`16`	`16`
`17`	`17`	`def get_git_hash() -> str:`