specs,schedulers: add VolumeMount #426

Closed
wants to merge 1 commit
6 changes: 5 additions & 1 deletion docs/source/specs.rst
@@ -67,10 +67,14 @@ Run Status
Mounts
--------

.. autofunction:: parse_mounts

.. autoclass:: BindMount
:members:

.. autofunction:: parse_mounts
.. autoclass:: VolumeMount
:members:


Component Linter
-----------------
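Before moving on to the scheduler changes, a quick sketch of the two mount classes documented above (field names follow the code in this PR; the paths and volume name are placeholders, and the parse_mounts option format is an assumption based on the docstrings):

from torchx import specs

# Bind-mount a host directory into the container, read-only.
bind = specs.BindMount(src_path="/mnt/data", dst_path="/data", read_only=True)

# Attach a volume; what `src` names (a Docker volume, EFS id, or PVC claim)
# depends on the scheduler, per the scheduler docs below.
vol = specs.VolumeMount(src="my-volume", dst_path="/data", read_only=False)

# parse_mounts builds the same objects from key=value options, e.g.:
# specs.parse_mounts(["type=volume", "src=my-volume", "dst=/data", "readonly"])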
2 changes: 1 addition & 1 deletion torchx/components/dist.py
@@ -172,7 +172,7 @@ def ddp(
max_retries: the number of scheduler retries allowed
rdzv_backend: rendezvous backend (only matters when nnodes > 1)
rdzv_endpoint: rendezvous server endpoint (only matters when nnodes > 1), defaults to rank0 host for schedulers that support it
mounts: the list of mounts to bind mount into the worker environment/container (ex. type=bind,src=/host,dst=/job[,readonly])
mounts: mounts to attach to the worker environment/container (ex. type=<bind/volume>,src=/host,dst=/job[,readonly]). See the scheduler documentation for more info.
"""

if (script is None) == (m is None):
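A usage sketch for the new ``mounts`` argument on dist.ddp (the script name, filesystem id, and the nodes-x-procs argument are placeholders, and the exact keyword names are assumptions; only the mount option format comes from the docstring above):

from torchx.components import dist

# Sketch: every replica gets an EFS volume mounted at /data (read-only).
app = dist.ddp(
    script="train.py",
    j="2x4",  # placeholder nnodes x nproc_per_node
    mounts=["type=volume", "src=fs-0123456789abcdef0", "dst=/data", "readonly"],
)
# The component presumably routes this list through specs.parse_mounts,
# yielding a VolumeMount(src="fs-0123456789abcdef0", dst_path="/data",
# read_only=True) on each role.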
42 changes: 34 additions & 8 deletions torchx/schedulers/aws_batch_scheduler.py
@@ -58,6 +58,8 @@
macros,
runopts,
CfgVal,
BindMount,
VolumeMount,
)
from torchx.workspace.docker_workspace import DockerWorkspace

@@ -95,14 +97,26 @@ def _role_to_node_properties(idx: int, role: Role) -> Dict[str, object]:
volumes = []
for i, mount in enumerate(role.mounts):
name = f"mount_{i}"
volumes.append(
{
"name": name,
"host": {
"sourcePath": mount.src_path,
},
}
)
if isinstance(mount, BindMount):
volumes.append(
{
"name": name,
"host": {
"sourcePath": mount.src_path,
},
}
)
elif isinstance(mount, VolumeMount):
volumes.append(
{
"name": name,
"efsVolumeConfiguration": {
"fileSystemId": mount.src,
},
}
)
else:
raise TypeError(f"unknown mount type {mount}")
mount_points.append(
{
"containerPath": mount.dst_path,
@@ -176,6 +190,18 @@ class AWSBatchScheduler(Scheduler, DockerWorkspace):
.. runopts::
class: torchx.schedulers.aws_batch_scheduler.AWSBatchScheduler

**Mounts**

This class supports bind mounting host directories and EFS volumes.

* bind mount: ``type=bind,src=<host path>,dst=<container path>[,readonly]``
* efs volume: ``type=volume,src=<efs id>,dst=<container path>[,readonly]``

See :py:func:`torchx.specs.parse_mounts` for more info.

For other filesystems such as FSx, you can mount them onto the host and bind
mount them into your job: https://aws.amazon.com/premiumsupport/knowledge-center/batch-fsx-lustre-file-system-mount/

**Compatibility**

.. compatibility::
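To make the EFS mapping concrete, a sketch mirroring the new unit test later in this PR (role name, image, and filesystem id are placeholders):

from torchx import specs
from torchx.schedulers.aws_batch_scheduler import _role_to_node_properties

role = specs.Role(
    name="trainer",
    image="my-image:latest",  # placeholder
    mounts=[
        specs.VolumeMount(src="fs-0123456789abcdef0", dst_path="/data", read_only=True),
    ],
)
props = _role_to_node_properties(0, role)
# props["container"]["volumes"] ->
#   [{"name": "mount_0", "efsVolumeConfiguration": {"fileSystemId": "fs-0123456789abcdef0"}}]
# props["container"]["mountPoints"] ->
#   [{"containerPath": "/data", "readOnly": True, "sourceVolume": "mount_0"}]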
41 changes: 34 additions & 7 deletions torchx/schedulers/docker_scheduler.py
@@ -33,6 +33,8 @@
is_terminal,
macros,
runopts,
BindMount,
VolumeMount,
)
from torchx.workspace.docker_workspace import DockerWorkspace

@@ -104,6 +106,19 @@ class DockerScheduler(Scheduler, DockerWorkspace):
in a job fails, only that replica will be restarted.


**Config Options**

.. runopts::
class: torchx.schedulers.docker_scheduler.DockerScheduler

**Mounts**

This class supports bind mounting directories and named volumes.

* bind mount: ``type=bind,src=<host path>,dst=<container path>[,readonly]``
* named volume: ``type=volume,src=<name>,dst=<container path>[,readonly]``

See :py:func:`torchx.specs.parse_mounts` for more info.

.. compatibility::
type: scheduler
@@ -192,14 +207,26 @@ def _submit_dryrun(
for role in app.roles:
mounts = []
for mount in role.mounts:
mounts.append(
Mount(
target=mount.dst_path,
source=mount.src_path,
read_only=mount.read_only,
type="bind",
if isinstance(mount, BindMount):
mounts.append(
Mount(
target=mount.dst_path,
source=mount.src_path,
read_only=mount.read_only,
type="bind",
)
)
)
elif isinstance(mount, VolumeMount):
mounts.append(
Mount(
target=mount.dst_path,
source=mount.src,
read_only=mount.read_only,
type="volume",
)
)
else:
raise TypeError(f"unknown mount type {mount}")

for replica_id in range(role.num_replicas):
values = macros.Values(
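A sketch of the Docker mapping above, with both mount kinds on one role (names and paths are placeholders):

from torchx import specs

role = specs.Role(
    name="trainer",
    image="my-image:latest",  # placeholder
    mounts=[
        specs.BindMount(src_path="/mnt/checkpoints", dst_path="/checkpoints", read_only=False),
        specs.VolumeMount(src="dataset-cache", dst_path="/data", read_only=True),
    ],
)
# Per the _submit_dryrun change above, these are expected to surface on each
# replica's container as:
#   Mount(target="/checkpoints", source="/mnt/checkpoints", read_only=False, type="bind")
#   Mount(target="/data", source="dataset-cache", read_only=True, type="volume")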
41 changes: 34 additions & 7 deletions torchx/schedulers/kubernetes_scheduler.py
@@ -63,6 +63,8 @@
SchedulerBackend,
macros,
runopts,
BindMount,
VolumeMount,
)
from torchx.workspace.docker_workspace import DockerWorkspace

@@ -169,6 +171,7 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
V1VolumeMount,
V1Volume,
V1HostPathVolumeSource,
V1PersistentVolumeClaimVolumeSource,
)

requests = {}
@@ -190,14 +193,26 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
volume_mounts = []
for i, mount in enumerate(role.mounts):
mount_name = f"mount-{i}"
volumes.append(
V1Volume(
name=mount_name,
host_path=V1HostPathVolumeSource(
path=mount.src_path,
),
if isinstance(mount, BindMount):
volumes.append(
V1Volume(
name=mount_name,
host_path=V1HostPathVolumeSource(
path=mount.src_path,
),
)
)
)
elif isinstance(mount, VolumeMount):
volumes.append(
V1Volume(
name=mount_name,
persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
claim_name=mount.src,
),
)
)
else:
raise TypeError(f"unknown mount type {mount}")
volume_mounts.append(
V1VolumeMount(
name=mount_name,
@@ -374,6 +389,18 @@ class KubernetesScheduler(Scheduler, DockerWorkspace):
.. runopts::
class: torchx.schedulers.kubernetes_scheduler.KubernetesScheduler

**Mounts**

External filesystems and volumes can be mounted via the Kubernetes hostPath
and PersistentVolumeClaim volume types.

* hostPath volumes: ``type=bind,src=<host path>,dst=<container path>[,readonly]``
* PersistentVolumeClaim: ``type=volume,src=<claim>,dst=<container path>[,readonly]``

See :py:func:`torchx.specs.parse_mounts` for more info.

External docs: https://kubernetes.io/docs/concepts/storage/persistent-volumes/

**Compatibility**

.. compatibility::
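A sketch of the Kubernetes mapping, following the new role_to_pod behavior and the test added below (claim name, role name, and image are placeholders):

from torchx import specs
from torchx.schedulers.kubernetes_scheduler import role_to_pod

role = specs.Role(
    name="trainer",
    image="my-image:latest",  # placeholder
    mounts=[
        specs.VolumeMount(src="training-data-claim", dst_path="/data", read_only=True),
    ],
)
pod = role_to_pod("trainer", role, service_account="")
# pod.spec.volumes[0] is a V1Volume backed by a persistentVolumeClaim with
# claim_name="training-data-claim"; pod.spec.containers[0].volume_mounts[0]
# mounts it read-only at /data.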
33 changes: 33 additions & 0 deletions torchx/schedulers/test/aws_batch_scheduler_test.py
@@ -14,6 +14,7 @@
from torchx.schedulers.aws_batch_scheduler import (
create_scheduler,
AWSBatchScheduler,
_role_to_node_properties,
)


@@ -184,6 +185,38 @@ def test_submit_dryrun(self) -> None:
},
)

def test_volume_mounts(self) -> None:
role = specs.Role(
name="foo",
image="",
mounts=[
specs.VolumeMount(src="efsid", dst_path="/dst", read_only=True),
],
)
props = _role_to_node_properties(0, role)
self.assertEqual(
# pyre-fixme[16]: `object` has no attribute `__getitem__`.
props["container"]["volumes"],
[
{
"name": "mount_0",
"efsVolumeConfiguration": {
"fileSystemId": "efsid",
},
}
],
)
self.assertEqual(
props["container"]["mountPoints"],
[
{
"containerPath": "/dst",
"readOnly": True,
"sourceVolume": "mount_0",
}
],
)

def _mock_scheduler(self) -> AWSBatchScheduler:
scheduler = AWSBatchScheduler(
"test",
19 changes: 18 additions & 1 deletion torchx/schedulers/test/docker_scheduler_test.py
@@ -114,14 +114,31 @@ def test_submit_dryrun(self) -> None:
source="/tmp",
read_only=True,
type="bind",
)
),
],
},
)
],
)
self.assertEqual(str(info), str(want))

def test_volume_mounts(self) -> None:
app = _test_app()
app.roles[0].mounts = [
specs.VolumeMount(src="name", dst_path="/tmp", read_only=True),
]

info = self.scheduler._submit_dryrun(app, cfg={})
want = [
Mount(
target="/tmp",
source="name",
read_only=True,
type="volume",
),
]
self.assertEqual(info.request.containers[0].kwargs["mounts"], want)

@patch("os.environ", {"FOO_1": "f1", "BAR_1": "b1", "FOOBAR_1": "fb1"})
def test_copy_env(self) -> None:
app = _test_app()
38 changes: 38 additions & 0 deletions torchx/schedulers/test/kubernetes_scheduler_test.py
@@ -283,6 +283,44 @@ def test_submit_dryrun(self) -> None:
""",
)

def test_volume_mounts(self) -> None:
scheduler = create_scheduler("test")
from kubernetes.client.models import (
V1Volume,
V1VolumeMount,
V1PersistentVolumeClaimVolumeSource,
)

role = specs.Role(
name="foo",
image="",
mounts=[
specs.VolumeMount(src="name", dst_path="/dst", read_only=True),
],
)
pod = role_to_pod("foo", role, service_account="")
self.assertEqual(
pod.spec.volumes,
[
V1Volume(
name="mount-0",
persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
claim_name="name",
),
),
],
)
self.assertEqual(
pod.spec.containers[0].volume_mounts,
[
V1VolumeMount(
name="mount-0",
mount_path="/dst",
read_only=True,
)
],
)

def test_rank0_env(self) -> None:
from kubernetes.client.models import (
V1EnvVar,
1 change: 1 addition & 0 deletions torchx/specs/__init__.py
@@ -27,6 +27,7 @@
AppState,
AppStatus,
BindMount,
VolumeMount,
CfgVal,
InvalidRunConfigException,
MalformedAppHandleException,