specs,schedulers: add VolumeMount #426

Closed
wants to merge 1 commit
6 changes: 5 additions & 1 deletion docs/source/specs.rst
@@ -67,10 +67,14 @@ Run Status
Mounts
--------

.. autofunction:: parse_mounts

.. autoclass:: BindMount
:members:

.. autofunction:: parse_mounts
.. autoclass:: VolumeMount
:members:


Component Linter
-----------------
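Before moving on to the scheduler changes, a quick sketch of the two mount classes documented above (field names follow the code in this PR; the paths and volume name are placeholders, and the parse_mounts option format is an assumption based on the docstrings):

from torchx import specs

# Bind-mount a host directory into the container, read-only.
bind = specs.BindMount(src_path="/mnt/data", dst_path="/data", read_only=True)

# Attach a volume; what `src` names (a Docker volume, EFS id, or PVC claim)
# depends on the scheduler, per the scheduler docs below.
vol = specs.VolumeMount(src="my-volume", dst_path="/data", read_only=False)

# parse_mounts builds the same objects from key=value options, e.g.:
# specs.parse_mounts(["type=volume", "src=my-volume", "dst=/data", "readonly"])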
2 changes: 1 addition & 1 deletion torchx/components/dist.py
@@ -172,7 +172,7 @@ def ddp(
max_retries: the number of scheduler retries allowed
rdzv_backend: rendezvous backend (only matters when nnodes > 1)
rdzv_endpoint: rendezvous server endpoint (only matters when nnodes > 1), defaults to rank0 host for schedulers that support it
mounts: the list of mounts to bind mount into the worker environment/container (ex. type=bind,src=/host,dst=/job[,readonly])
mounts: mounts to attach to the worker environment/container (ex. type=<bind/volume>,src=/host,dst=/job[,readonly]). See the scheduler documentation for more info.
"""

if (script is None) == (m is None):
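A usage sketch for the new ``mounts`` argument on dist.ddp (the script name, filesystem id, and the nodes-x-procs argument are placeholders, and the exact keyword names are assumptions; only the mount option format comes from the docstring above):

from torchx.components import dist

# Sketch: every replica gets an EFS volume mounted at /data (read-only).
app = dist.ddp(
    script="train.py",
    j="2x4",  # placeholder nnodes x nproc_per_node
    mounts=["type=volume", "src=fs-0123456789abcdef0", "dst=/data", "readonly"],
)
# The component presumably routes this list through specs.parse_mounts,
# yielding a VolumeMount(src="fs-0123456789abcdef0", dst_path="/data",
# read_only=True) on each role.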
42 changes: 34 additions & 8 deletions torchx/schedulers/aws_batch_scheduler.py
@@ -58,6 +58,8 @@
macros,
runopts,
CfgVal,
BindMount,
VolumeMount,
)
from torchx.workspace.docker_workspace import DockerWorkspace

@@ -95,14 +97,26 @@ def _role_to_node_properties(idx: int, role: Role) -> Dict[str, object]:
volumes = []
for i, mount in enumerate(role.mounts):
name = f"mount_{i}"
volumes.append(
{
"name": name,
"host": {
"sourcePath": mount.src_path,
},
}
)
if isinstance(mount, BindMount):
volumes.append(
{
"name": name,
"host": {
"sourcePath": mount.src_path,
},
}
)
elif isinstance(mount, VolumeMount):
volumes.append(
{
"name": name,
"efsVolumeConfiguration": {
"fileSystemId": mount.src,
},
}
)
else:
raise TypeError(f"unknown mount type {mount}")
mount_points.append(
{
"containerPath": mount.dst_path,
@@ -176,6 +190,18 @@ class AWSBatchScheduler(Scheduler, DockerWorkspace):
.. runopts::
class: torchx.schedulers.aws_batch_scheduler.AWSBatchScheduler

**Mounts**

This class supports bind mounting host directories and EFS volumes.

* bind mount: ``type=bind,src=<host path>,dst=<container path>[,readonly]``
* efs volume: ``type=volume,src=<efs id>,dst=<container path>[,readonly]``

See :py:func:`torchx.specs.parse_mounts` for more info.

For other filesystems such as FSx, you can mount them onto the host and bind
mount them into your job: https://aws.amazon.com/premiumsupport/knowledge-center/batch-fsx-lustre-file-system-mount/

**Compatibility**

.. compatibility::
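To make the EFS mapping concrete, a sketch mirroring the new unit test later in this PR (role name, image, and filesystem id are placeholders):

from torchx import specs
from torchx.schedulers.aws_batch_scheduler import _role_to_node_properties

role = specs.Role(
    name="trainer",
    image="my-image:latest",  # placeholder
    mounts=[
        specs.VolumeMount(src="fs-0123456789abcdef0", dst_path="/data", read_only=True),
    ],
)
props = _role_to_node_properties(0, role)
# props["container"]["volumes"] ->
#   [{"name": "mount_0", "efsVolumeConfiguration": {"fileSystemId": "fs-0123456789abcdef0"}}]
# props["container"]["mountPoints"] ->
#   [{"containerPath": "/data", "readOnly": True, "sourceVolume": "mount_0"}]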
41 changes: 34 additions & 7 deletions torchx/schedulers/docker_scheduler.py
@@ -33,6 +33,8 @@
is_terminal,
macros,
runopts,
BindMount,
VolumeMount,
)
from torchx.workspace.docker_workspace import DockerWorkspace

@@ -104,6 +106,19 @@ class DockerScheduler(Scheduler, DockerWorkspace):
in a job fails, only that replica will be restarted.


**Config Options**

.. runopts::
class: torchx.schedulers.docker_scheduler.DockerScheduler

**Mounts**

This class supports bind mounting directories and named volumes.

* bind mount: ``type=bind,src=<host path>,dst=<container path>[,readonly]``
* named volume: ``type=volume,src=<name>,dst=<container path>[,readonly]``

See :py:func:`torchx.specs.parse_mounts` for more info.

.. compatibility::
type: scheduler
@@ -192,14 +207,26 @@ def _submit_dryrun(
for role in app.roles:
mounts = []
for mount in role.mounts:
mounts.append(
Mount(
target=mount.dst_path,
source=mount.src_path,
read_only=mount.read_only,
type="bind",
if isinstance(mount, BindMount):
mounts.append(
Mount(
target=mount.dst_path,
source=mount.src_path,
read_only=mount.read_only,
type="bind",
)
)
)
elif isinstance(mount, VolumeMount):
mounts.append(
Mount(
target=mount.dst_path,
source=mount.src,
read_only=mount.read_only,
type="volume",
)
)
else:
raise TypeError(f"unknown mount type {mount}")

for replica_id in range(role.num_replicas):
values = macros.Values(
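A sketch of the Docker mapping above, with both mount kinds on one role (names and paths are placeholders):

from torchx import specs

role = specs.Role(
    name="trainer",
    image="my-image:latest",  # placeholder
    mounts=[
        specs.BindMount(src_path="/mnt/checkpoints", dst_path="/checkpoints", read_only=False),
        specs.VolumeMount(src="dataset-cache", dst_path="/data", read_only=True),
    ],
)
# Per the _submit_dryrun change above, these are expected to surface on each
# replica's container as:
#   Mount(target="/checkpoints", source="/mnt/checkpoints", read_only=False, type="bind")
#   Mount(target="/data", source="dataset-cache", read_only=True, type="volume")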
41 changes: 34 additions & 7 deletions torchx/schedulers/kubernetes_scheduler.py
@@ -63,6 +63,8 @@
SchedulerBackend,
macros,
runopts,
BindMount,
VolumeMount,
)
from torchx.workspace.docker_workspace import DockerWorkspace

@@ -169,6 +171,7 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
V1VolumeMount,
V1Volume,
V1HostPathVolumeSource,
V1PersistentVolumeClaimVolumeSource,
)

requests = {}
@@ -190,14 +193,26 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
volume_mounts = []
for i, mount in enumerate(role.mounts):
mount_name = f"mount-{i}"
volumes.append(
V1Volume(
name=mount_name,
host_path=V1HostPathVolumeSource(
path=mount.src_path,
),
if isinstance(mount, BindMount):
volumes.append(
V1Volume(
name=mount_name,
host_path=V1HostPathVolumeSource(
path=mount.src_path,
),
)
)
)
elif isinstance(mount, VolumeMount):
volumes.append(
V1Volume(
name=mount_name,
persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
claim_name=mount.src,
),
)
)
else:
raise TypeError(f"unknown mount type {mount}")
volume_mounts.append(
V1VolumeMount(
name=mount_name,
@@ -374,6 +389,18 @@ class KubernetesScheduler(Scheduler, DockerWorkspace):
.. runopts::
class: torchx.schedulers.kubernetes_scheduler.KubernetesScheduler

**Mounts**

External filesystems and volumes can be mounted via the Kubernetes hostPath
and PersistentVolumeClaim volume types.

* hostPath volumes: ``type=bind,src=<host path>,dst=<container path>[,readonly]``
* PersistentVolumeClaim: ``type=volume,src=<claim>,dst=<container path>[,readonly]``

See :py:func:`torchx.specs.parse_mounts` for more info.

External docs: https://kubernetes.io/docs/concepts/storage/persistent-volumes/

**Compatibility**

.. compatibility::
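A sketch of the Kubernetes mapping, following the new role_to_pod behavior and the test added below (claim name, role name, and image are placeholders):

from torchx import specs
from torchx.schedulers.kubernetes_scheduler import role_to_pod

role = specs.Role(
    name="trainer",
    image="my-image:latest",  # placeholder
    mounts=[
        specs.VolumeMount(src="training-data-claim", dst_path="/data", read_only=True),
    ],
)
pod = role_to_pod("trainer", role, service_account="")
# pod.spec.volumes[0] is a V1Volume backed by a persistentVolumeClaim with
# claim_name="training-data-claim"; pod.spec.containers[0].volume_mounts[0]
# mounts it read-only at /data.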
33 changes: 33 additions & 0 deletions torchx/schedulers/test/aws_batch_scheduler_test.py
@@ -14,6 +14,7 @@
from torchx.schedulers.aws_batch_scheduler import (
create_scheduler,
AWSBatchScheduler,
_role_to_node_properties,
)


@@ -184,6 +185,38 @@ def test_submit_dryrun(self) -> None:
},
)

def test_volume_mounts(self) -> None:
role = specs.Role(
name="foo",
image="",
mounts=[
specs.VolumeMount(src="efsid", dst_path="/dst", read_only=True),
],
)
props = _role_to_node_properties(0, role)
self.assertEqual(
# pyre-fixme[16]: `object` has no attribute `__getitem__`.
props["container"]["volumes"],
[
{
"name": "mount_0",
"efsVolumeConfiguration": {
"fileSystemId": "efsid",
},
}
],
)
self.assertEqual(
props["container"]["mountPoints"],
[
{
"containerPath": "/dst",
"readOnly": True,
"sourceVolume": "mount_0",
}
],
)

def _mock_scheduler(self) -> AWSBatchScheduler:
scheduler = AWSBatchScheduler(
"test",
19 changes: 18 additions & 1 deletion torchx/schedulers/test/docker_scheduler_test.py
@@ -114,14 +114,31 @@ def test_submit_dryrun(self) -> None:
source="/tmp",
read_only=True,
type="bind",
)
),
],
},
)
],
)
self.assertEqual(str(info), str(want))

def test_volume_mounts(self) -> None:
app = _test_app()
app.roles[0].mounts = [
specs.VolumeMount(src="name", dst_path="/tmp", read_only=True),
]

info = self.scheduler._submit_dryrun(app, cfg={})
want = [
Mount(
target="/tmp",
source="name",
read_only=True,
type="volume",
),
]
self.assertEqual(info.request.containers[0].kwargs["mounts"], want)

@patch("os.environ", {"FOO_1": "f1", "BAR_1": "b1", "FOOBAR_1": "fb1"})
def test_copy_env(self) -> None:
app = _test_app()
38 changes: 38 additions & 0 deletions torchx/schedulers/test/kubernetes_scheduler_test.py
@@ -283,6 +283,44 @@ def test_submit_dryrun(self) -> None:
""",
)

def test_volume_mounts(self) -> None:
scheduler = create_scheduler("test")
from kubernetes.client.models import (
V1Volume,
V1VolumeMount,
V1PersistentVolumeClaimVolumeSource,
)

role = specs.Role(
name="foo",
image="",
mounts=[
specs.VolumeMount(src="name", dst_path="/dst", read_only=True),
],
)
pod = role_to_pod("foo", role, service_account="")
self.assertEqual(
pod.spec.volumes,
[
V1Volume(
name="mount-0",
persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
claim_name="name",
),
),
],
)
self.assertEqual(
pod.spec.containers[0].volume_mounts,
[
V1VolumeMount(
name="mount-0",
mount_path="/dst",
read_only=True,
)
],
)

def test_rank0_env(self) -> None:
from kubernetes.client.models import (
V1EnvVar,
1 change: 1 addition & 0 deletions torchx/specs/__init__.py
@@ -27,6 +27,7 @@
AppState,
AppStatus,
BindMount,
VolumeMount,
CfgVal,
InvalidRunConfigException,
MalformedAppHandleException,