Skip to content

Commit

Permalink
specs,schedulers: add VolumeMount
Browse files Browse the repository at this point in the history
  • Loading branch information
d4l3k committed Mar 16, 2022
1 parent 354d26e commit e6ac408
Show file tree
Hide file tree
Showing 11 changed files with 243 additions and 36 deletions.
6 changes: 5 additions & 1 deletion docs/source/specs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,14 @@ Run Status
Mounts
--------

.. autofunction:: parse_mounts

.. autoclass:: BindMount
:members:

.. autofunction:: parse_mounts
.. autoclass:: VolumeMount
:members:


Component Linter
-----------------
Expand Down
2 changes: 1 addition & 1 deletion torchx/components/dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def ddp(
max_retries: the number of scheduler retries allowed
rdzv_backend: rendezvous backend (only matters when nnodes > 1)
rdzv_endpoint: rendezvous server endpoint (only matters when nnodes > 1), defaults to rank0 host for schedulers that support it
mounts: the list of mounts to bind mount into the worker environment/container (ex. type=bind,src=/host,dst=/job[,readonly])
mounts: mounts to mount into the worker environment/container (ex. type=<bind/volume>,src=/host,dst=/job[,readonly]). See scheduler documentation for more info.
"""

if (script is None) == (m is None):
Expand Down
42 changes: 34 additions & 8 deletions torchx/schedulers/aws_batch_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@
macros,
runopts,
CfgVal,
BindMount,
VolumeMount,
)
from torchx.workspace.docker_workspace import DockerWorkspace

Expand Down Expand Up @@ -95,14 +97,26 @@ def _role_to_node_properties(idx: int, role: Role) -> Dict[str, object]:
volumes = []
for i, mount in enumerate(role.mounts):
name = f"mount_{i}"
volumes.append(
{
"name": name,
"host": {
"sourcePath": mount.src_path,
},
}
)
if isinstance(mount, BindMount):
volumes.append(
{
"name": name,
"host": {
"sourcePath": mount.src_path,
},
}
)
elif isinstance(mount, VolumeMount):
volumes.append(
{
"name": name,
"efsVolumeConfiguration": {
"fileSystemId": mount.src,
},
}
)
else:
raise TypeError(f"unknown mount type {mount}")
mount_points.append(
{
"containerPath": mount.dst_path,
Expand Down Expand Up @@ -176,6 +190,18 @@ class AWSBatchScheduler(Scheduler, DockerWorkspace):
.. runopts::
class: torchx.schedulers.aws_batch_scheduler.AWSBatchScheduler
**Mounts**
This class supports bind mounting host directories and efs volumes
* bind mount: ``type=bind,src=<host path>,dst=<container path>[,readonly]``
* efs volume: ``type=volume,src=<efs id>,dst=<container path>[,readonly]``
See :py:func:`torchx.specs.parse_mounts` for more info.
For other filesystems such as FSx you can mount them onto the host and bind
mount them into your job: https://aws.amazon.com/premiumsupport/knowledge-center/batch-fsx-lustre-file-system-mount/
**Compatibility**
.. compatibility::
Expand Down
41 changes: 34 additions & 7 deletions torchx/schedulers/docker_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
is_terminal,
macros,
runopts,
BindMount,
VolumeMount,
)
from torchx.workspace.docker_workspace import DockerWorkspace

Expand Down Expand Up @@ -104,6 +106,19 @@ class DockerScheduler(Scheduler, DockerWorkspace):
in a job fails, only that replica will be restarted.
**Config Options**
.. runopts::
class: torchx.schedulers.kubernetes_scheduler.KubernetesScheduler
**Mounts**
This class supports bind mounting directories and named volumes.
* bind mount: ``type=bind,src=<host path>,dst=<container path>[,readonly]``
* named volume: ``type=volume,src=<name>,dst=<container path>[,readonly]``
See :py:func:`torchx.specs.parse_mounts` for more info.
.. compatibility::
type: scheduler
Expand Down Expand Up @@ -192,14 +207,26 @@ def _submit_dryrun(
for role in app.roles:
mounts = []
for mount in role.mounts:
mounts.append(
Mount(
target=mount.dst_path,
source=mount.src_path,
read_only=mount.read_only,
type="bind",
if isinstance(mount, BindMount):
mounts.append(
Mount(
target=mount.dst_path,
source=mount.src_path,
read_only=mount.read_only,
type="bind",
)
)
)
elif isinstance(mount, VolumeMount):
mounts.append(
Mount(
target=mount.dst_path,
source=mount.src,
read_only=mount.read_only,
type="volume",
)
)
else:
raise TypeError(f"unknown mount type {mount}")

for replica_id in range(role.num_replicas):
values = macros.Values(
Expand Down
41 changes: 34 additions & 7 deletions torchx/schedulers/kubernetes_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
SchedulerBackend,
macros,
runopts,
BindMount,
VolumeMount,
)
from torchx.workspace.docker_workspace import DockerWorkspace

Expand Down Expand Up @@ -169,6 +171,7 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
V1VolumeMount,
V1Volume,
V1HostPathVolumeSource,
V1PersistentVolumeClaimVolumeSource,
)

requests = {}
Expand All @@ -190,14 +193,26 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
volume_mounts = []
for i, mount in enumerate(role.mounts):
mount_name = f"mount-{i}"
volumes.append(
V1Volume(
name=mount_name,
host_path=V1HostPathVolumeSource(
path=mount.src_path,
),
if isinstance(mount, BindMount):
volumes.append(
V1Volume(
name=mount_name,
host_path=V1HostPathVolumeSource(
path=mount.src_path,
),
)
)
)
elif isinstance(mount, VolumeMount):
volumes.append(
V1Volume(
name=mount_name,
persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
claim_name=mount.src,
),
)
)
else:
raise TypeError(f"unknown mount type {mount}")
volume_mounts.append(
V1VolumeMount(
name=mount_name,
Expand Down Expand Up @@ -374,6 +389,18 @@ class KubernetesScheduler(Scheduler, DockerWorkspace):
.. runopts::
class: torchx.schedulers.kubernetes_scheduler.KubernetesScheduler
**Mounts**
Mounting external filesystems/volumes is via the HostPath and
PersistentVolumeClaim support.
* hostPath volumes: ``type=bind,src=<host path>,dst=<container path>[,readonly]``
* PersistentVolumeClaim: ``type=volume,src=<claim>,dst=<container path>[,readonly]``
See :py:func:`torchx.specs.parse_mounts` for more info.
External docs: https://kubernetes.io/docs/concepts/storage/persistent-volumes/
**Compatibility**
.. compatibility::
Expand Down
33 changes: 33 additions & 0 deletions torchx/schedulers/test/aws_batch_scheduler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from torchx.schedulers.aws_batch_scheduler import (
create_scheduler,
AWSBatchScheduler,
_role_to_node_properties,
)


Expand Down Expand Up @@ -184,6 +185,38 @@ def test_submit_dryrun(self) -> None:
},
)

def test_volume_mounts(self) -> None:
role = specs.Role(
name="foo",
image="",
mounts=[
specs.VolumeMount(src="efsid", dst_path="/dst", read_only=True),
],
)
props = _role_to_node_properties(0, role)
self.assertEqual(
# pyre-fixme[16]: `object` has no attribute `__getitem__`.
props["container"]["volumes"],
[
{
"name": "mount_0",
"efsVolumeConfiguration": {
"fileSystemId": "efsid",
},
}
],
)
self.assertEqual(
props["container"]["mountPoints"],
[
{
"containerPath": "/dst",
"readOnly": True,
"sourceVolume": "mount_0",
}
],
)

def _mock_scheduler(self) -> AWSBatchScheduler:
scheduler = AWSBatchScheduler(
"test",
Expand Down
19 changes: 18 additions & 1 deletion torchx/schedulers/test/docker_scheduler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,31 @@ def test_submit_dryrun(self) -> None:
source="/tmp",
read_only=True,
type="bind",
)
),
],
},
)
],
)
self.assertEqual(str(info), str(want))

def test_volume_mounts(self) -> None:
app = _test_app()
app.roles[0].mounts = [
specs.VolumeMount(src="name", dst_path="/tmp", read_only=True),
]

info = self.scheduler._submit_dryrun(app, cfg={})
want = [
Mount(
target="/tmp",
source="name",
read_only=True,
type="volume",
),
]
self.assertEqual(info.request.containers[0].kwargs["mounts"], want)

@patch("os.environ", {"FOO_1": "f1", "BAR_1": "b1", "FOOBAR_1": "fb1"})
def test_copy_env(self) -> None:
app = _test_app()
Expand Down
38 changes: 38 additions & 0 deletions torchx/schedulers/test/kubernetes_scheduler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,44 @@ def test_submit_dryrun(self) -> None:
""",
)

def test_volume_mounts(self) -> None:
scheduler = create_scheduler("test")
from kubernetes.client.models import (
V1Volume,
V1VolumeMount,
V1PersistentVolumeClaimVolumeSource,
)

role = specs.Role(
name="foo",
image="",
mounts=[
specs.VolumeMount(src="name", dst_path="/dst", read_only=True),
],
)
pod = role_to_pod("foo", role, service_account="")
self.assertEqual(
pod.spec.volumes,
[
V1Volume(
name="mount-0",
persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
claim_name="name",
),
),
],
)
self.assertEqual(
pod.spec.containers[0].volume_mounts,
[
V1VolumeMount(
name="mount-0",
mount_path="/dst",
read_only=True,
)
],
)

def test_rank0_env(self) -> None:
from kubernetes.client.models import (
V1EnvVar,
Expand Down
1 change: 1 addition & 0 deletions torchx/specs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
AppState,
AppStatus,
BindMount,
VolumeMount,
CfgVal,
InvalidRunConfigException,
MalformedAppHandleException,
Expand Down
Loading

0 comments on commit e6ac408

Please sign in to comment.