From e6ac408fe2a2cc161c3fccb5e6be850a9a96af7b Mon Sep 17 00:00:00 2001 From: Tristan Rice Date: Tue, 15 Mar 2022 16:22:36 -0700 Subject: [PATCH] specs,schedulers: add VolumeMount --- docs/source/specs.rst | 6 ++- torchx/components/dist.py | 2 +- torchx/schedulers/aws_batch_scheduler.py | 42 ++++++++++++---- torchx/schedulers/docker_scheduler.py | 41 +++++++++++++--- torchx/schedulers/kubernetes_scheduler.py | 41 +++++++++++++--- .../test/aws_batch_scheduler_test.py | 33 +++++++++++++ .../schedulers/test/docker_scheduler_test.py | 19 +++++++- .../test/kubernetes_scheduler_test.py | 38 +++++++++++++++ torchx/specs/__init__.py | 1 + torchx/specs/api.py | 48 +++++++++++++++---- torchx/specs/test/api_test.py | 8 ++-- 11 files changed, 243 insertions(+), 36 deletions(-) diff --git a/docs/source/specs.rst b/docs/source/specs.rst index b266aa7b1..2f1b67bdb 100644 --- a/docs/source/specs.rst +++ b/docs/source/specs.rst @@ -67,10 +67,14 @@ Run Status Mounts -------- +.. autofunction:: parse_mounts + .. autoclass:: BindMount :members: -.. autofunction:: parse_mounts +.. autoclass:: VolumeMount + :members: + Component Linter ----------------- diff --git a/torchx/components/dist.py b/torchx/components/dist.py index 5133658db..25ff4c180 100644 --- a/torchx/components/dist.py +++ b/torchx/components/dist.py @@ -172,7 +172,7 @@ def ddp( max_retries: the number of scheduler retries allowed rdzv_backend: rendezvous backend (only matters when nnodes > 1) rdzv_endpoint: rendezvous server endpoint (only matters when nnodes > 1), defaults to rank0 host for schedulers that support it - mounts: the list of mounts to bind mount into the worker environment/container (ex. type=bind,src=/host,dst=/job[,readonly]) + mounts: mounts to mount into the worker environment/container (ex. type=,src=/host,dst=/job[,readonly]). See scheduler documentation for more info. 
""" if (script is None) == (m is None): diff --git a/torchx/schedulers/aws_batch_scheduler.py b/torchx/schedulers/aws_batch_scheduler.py index bf46f9eee..3738df0c8 100644 --- a/torchx/schedulers/aws_batch_scheduler.py +++ b/torchx/schedulers/aws_batch_scheduler.py @@ -58,6 +58,8 @@ macros, runopts, CfgVal, + BindMount, + VolumeMount, ) from torchx.workspace.docker_workspace import DockerWorkspace @@ -95,14 +97,26 @@ def _role_to_node_properties(idx: int, role: Role) -> Dict[str, object]: volumes = [] for i, mount in enumerate(role.mounts): name = f"mount_{i}" - volumes.append( - { - "name": name, - "host": { - "sourcePath": mount.src_path, - }, - } - ) + if isinstance(mount, BindMount): + volumes.append( + { + "name": name, + "host": { + "sourcePath": mount.src_path, + }, + } + ) + elif isinstance(mount, VolumeMount): + volumes.append( + { + "name": name, + "efsVolumeConfiguration": { + "fileSystemId": mount.src, + }, + } + ) + else: + raise TypeError(f"unknown mount type {mount}") mount_points.append( { "containerPath": mount.dst_path, @@ -176,6 +190,18 @@ class AWSBatchScheduler(Scheduler, DockerWorkspace): .. runopts:: class: torchx.schedulers.aws_batch_scheduler.AWSBatchScheduler + **Mounts** + + This class supports bind mounting host directories and efs volumes + + * bind mount: ``type=bind,src=,dst=[,readonly]`` + * efs volume: ``type=volume,src=,dst=[,readonly]`` + + See :py:func:`torchx.specs.parse_mounts` for more info. + + For other filesystems such as FSx you can mount them onto the host and bind + mount them into your job: https://aws.amazon.com/premiumsupport/knowledge-center/batch-fsx-lustre-file-system-mount/ + **Compatibility** .. 
compatibility:: diff --git a/torchx/schedulers/docker_scheduler.py b/torchx/schedulers/docker_scheduler.py index c15b3ebbb..31370a6c1 100644 --- a/torchx/schedulers/docker_scheduler.py +++ b/torchx/schedulers/docker_scheduler.py @@ -33,6 +33,8 @@ is_terminal, macros, runopts, + BindMount, + VolumeMount, ) from torchx.workspace.docker_workspace import DockerWorkspace @@ -104,6 +106,19 @@ class DockerScheduler(Scheduler, DockerWorkspace): in a job fails, only that replica will be restarted. + **Config Options** + + .. runopts:: + class: torchx.schedulers.docker_scheduler.DockerScheduler + + **Mounts** + + This class supports bind mounting directories and named volumes. + + * bind mount: ``type=bind,src=,dst=[,readonly]`` + * named volume: ``type=volume,src=,dst=[,readonly]`` + + See :py:func:`torchx.specs.parse_mounts` for more info. .. compatibility:: type: scheduler @@ -192,14 +207,26 @@ def _submit_dryrun( for role in app.roles: mounts = [] for mount in role.mounts: - mounts.append( - Mount( - target=mount.dst_path, - source=mount.src_path, - read_only=mount.read_only, - type="bind", + if isinstance(mount, BindMount): + mounts.append( + Mount( + target=mount.dst_path, + source=mount.src_path, + read_only=mount.read_only, + type="bind", + ) ) - ) + elif isinstance(mount, VolumeMount): + mounts.append( + Mount( + target=mount.dst_path, + source=mount.src, + read_only=mount.read_only, + type="volume", + ) + ) + else: + raise TypeError(f"unknown mount type {mount}") for replica_id in range(role.num_replicas): values = macros.Values( diff --git a/torchx/schedulers/kubernetes_scheduler.py b/torchx/schedulers/kubernetes_scheduler.py index 91878d321..accf143ba 100644 --- a/torchx/schedulers/kubernetes_scheduler.py +++ b/torchx/schedulers/kubernetes_scheduler.py @@ -63,6 +63,8 @@ SchedulerBackend, macros, runopts, + BindMount, + VolumeMount, ) from torchx.workspace.docker_workspace import DockerWorkspace @@ -169,6 +171,7 @@ def role_to_pod(name: str, role: Role, 
service_account: Optional[str]) -> "V1Pod V1VolumeMount, V1Volume, V1HostPathVolumeSource, + V1PersistentVolumeClaimVolumeSource, ) requests = {} @@ -190,14 +193,26 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod volume_mounts = [] for i, mount in enumerate(role.mounts): mount_name = f"mount-{i}" - volumes.append( - V1Volume( - name=mount_name, - host_path=V1HostPathVolumeSource( - path=mount.src_path, - ), + if isinstance(mount, BindMount): + volumes.append( + V1Volume( + name=mount_name, + host_path=V1HostPathVolumeSource( + path=mount.src_path, + ), + ) ) - ) + elif isinstance(mount, VolumeMount): + volumes.append( + V1Volume( + name=mount_name, + persistent_volume_claim=V1PersistentVolumeClaimVolumeSource( + claim_name=mount.src, + ), + ) + ) + else: + raise TypeError(f"unknown mount type {mount}") volume_mounts.append( V1VolumeMount( name=mount_name, @@ -374,6 +389,18 @@ class KubernetesScheduler(Scheduler, DockerWorkspace): .. runopts:: class: torchx.schedulers.kubernetes_scheduler.KubernetesScheduler + **Mounts** + + Mounting external filesystems/volumes is via the HostPath and + PersistentVolumeClaim support. + + * hostPath volumes: ``type=bind,src=,dst=[,readonly]`` + * PersistentVolumeClaim: ``type=volume,src=,dst=[,readonly]`` + + See :py:func:`torchx.specs.parse_mounts` for more info. + + External docs: https://kubernetes.io/docs/concepts/storage/persistent-volumes/ + **Compatibility** .. 
compatibility:: diff --git a/torchx/schedulers/test/aws_batch_scheduler_test.py b/torchx/schedulers/test/aws_batch_scheduler_test.py index 11765cb68..78e7eddaa 100644 --- a/torchx/schedulers/test/aws_batch_scheduler_test.py +++ b/torchx/schedulers/test/aws_batch_scheduler_test.py @@ -14,6 +14,7 @@ from torchx.schedulers.aws_batch_scheduler import ( create_scheduler, AWSBatchScheduler, + _role_to_node_properties, ) @@ -184,6 +185,38 @@ def test_submit_dryrun(self) -> None: }, ) + def test_volume_mounts(self) -> None: + role = specs.Role( + name="foo", + image="", + mounts=[ + specs.VolumeMount(src="efsid", dst_path="/dst", read_only=True), + ], + ) + props = _role_to_node_properties(0, role) + self.assertEqual( + # pyre-fixme[16]: `object` has no attribute `__getitem__`. + props["container"]["volumes"], + [ + { + "name": "mount_0", + "efsVolumeConfiguration": { + "fileSystemId": "efsid", + }, + } + ], + ) + self.assertEqual( + props["container"]["mountPoints"], + [ + { + "containerPath": "/dst", + "readOnly": True, + "sourceVolume": "mount_0", + } + ], + ) + def _mock_scheduler(self) -> AWSBatchScheduler: scheduler = AWSBatchScheduler( "test", diff --git a/torchx/schedulers/test/docker_scheduler_test.py b/torchx/schedulers/test/docker_scheduler_test.py index 4a0fc9871..e8002095c 100644 --- a/torchx/schedulers/test/docker_scheduler_test.py +++ b/torchx/schedulers/test/docker_scheduler_test.py @@ -114,7 +114,7 @@ def test_submit_dryrun(self) -> None: source="/tmp", read_only=True, type="bind", - ) + ), ], }, ) @@ -122,6 +122,23 @@ def test_submit_dryrun(self) -> None: ) self.assertEqual(str(info), str(want)) + def test_volume_mounts(self) -> None: + app = _test_app() + app.roles[0].mounts = [ + specs.VolumeMount(src="name", dst_path="/tmp", read_only=True), + ] + + info = self.scheduler._submit_dryrun(app, cfg={}) + want = [ + Mount( + target="/tmp", + source="name", + read_only=True, + type="volume", + ), + ] + 
self.assertEqual(info.request.containers[0].kwargs["mounts"], want) + @patch("os.environ", {"FOO_1": "f1", "BAR_1": "b1", "FOOBAR_1": "fb1"}) def test_copy_env(self) -> None: app = _test_app() diff --git a/torchx/schedulers/test/kubernetes_scheduler_test.py b/torchx/schedulers/test/kubernetes_scheduler_test.py index 2b59cbc95..a7d1e4356 100644 --- a/torchx/schedulers/test/kubernetes_scheduler_test.py +++ b/torchx/schedulers/test/kubernetes_scheduler_test.py @@ -283,6 +283,44 @@ def test_submit_dryrun(self) -> None: """, ) + def test_volume_mounts(self) -> None: + scheduler = create_scheduler("test") + from kubernetes.client.models import ( + V1Volume, + V1VolumeMount, + V1PersistentVolumeClaimVolumeSource, + ) + + role = specs.Role( + name="foo", + image="", + mounts=[ + specs.VolumeMount(src="name", dst_path="/dst", read_only=True), + ], + ) + pod = role_to_pod("foo", role, service_account="") + self.assertEqual( + pod.spec.volumes, + [ + V1Volume( + name="mount-0", + persistent_volume_claim=V1PersistentVolumeClaimVolumeSource( + claim_name="name", + ), + ), + ], + ) + self.assertEqual( + pod.spec.containers[0].volume_mounts, + [ + V1VolumeMount( + name="mount-0", + mount_path="/dst", + read_only=True, + ) + ], + ) + def test_rank0_env(self) -> None: from kubernetes.client.models import ( V1EnvVar, diff --git a/torchx/specs/__init__.py b/torchx/specs/__init__.py index 884dfc7c8..80c608c08 100644 --- a/torchx/specs/__init__.py +++ b/torchx/specs/__init__.py @@ -27,6 +27,7 @@ AppState, AppStatus, BindMount, + VolumeMount, CfgVal, InvalidRunConfigException, MalformedAppHandleException, diff --git a/torchx/specs/api.py b/torchx/specs/api.py index 77d5ac5b4..c8f19a3b4 100644 --- a/torchx/specs/api.py +++ b/torchx/specs/api.py @@ -196,6 +196,11 @@ class RetryPolicy(str, Enum): APPLICATION = "APPLICATION" +class MountType(str, Enum): + BIND = "bind" + VOLUME = "volume" + + @dataclass class BindMount: """ @@ -206,7 +211,7 @@ class BindMount: Args: src_path: the path on 
the host dst_path: the path in the worker environment/container - read_only: whether the bind should be read only + read_only: whether the mount should be read only """ src_path: str @@ -214,6 +219,21 @@ class BindMount: read_only: bool = False +@dataclass +class VolumeMount: + """ + Defines a persistent volume mount to mount into the worker environment. + Args: + src: the name or ID of the volume to mount + dst_path: the path in the worker environment/container + read_only: whether the mount should be read only + """ + + src: str + dst_path: str + read_only: bool = False + + @dataclass class Role: """ @@ -275,7 +295,7 @@ class Role: resource: Resource = NULL_RESOURCE port_map: Dict[str, int] = field(default_factory=dict) metadata: Dict[str, Any] = field(default_factory=dict) - mounts: List[BindMount] = field(default_factory=list) + mounts: List[Union[BindMount, VolumeMount]] = field(default_factory=list) def pre_proc( self, @@ -883,7 +903,7 @@ def from_function( } -def parse_mounts(opts: List[str]) -> List[BindMount]: +def parse_mounts(opts: List[str]) -> List[Union[BindMount, VolumeMount]]: """ parse_mounts parses a list of options into typed mounts following a similar format to Dockers bind mount. 
@@ -896,6 +916,7 @@ def parse_mounts(opts: List[str]) -> List[BindMount]: Supported types: BindMount: type=bind,src=,dst=[,readonly] + VolumeMount: type=volume,src=,dst=[,readonly] """ mount_opts = [] cur = {} @@ -916,10 +937,21 @@ def parse_mounts(opts: List[str]) -> List[BindMount]: mounts = [] for opts in mount_opts: typ = opts.get("type") - assert typ == "bind", "only bind mounts are currently supported" - mounts.append( - BindMount( - src_path=opts["src"], dst_path=opts["dst"], read_only="readonly" in opts + if typ == MountType.BIND: + mounts.append( + BindMount( + src_path=opts["src"], + dst_path=opts["dst"], + read_only="readonly" in opts, + ) ) - ) + elif typ == MountType.VOLUME: + mounts.append( + VolumeMount( + src=opts["src"], dst_path=opts["dst"], read_only="readonly" in opts + ) + ) + else: + valid = list(str(item.value) for item in MountType) + raise ValueError(f"invalid mount type {repr(typ)}, must be one of {valid}") return mounts diff --git a/torchx/specs/test/api_test.py b/torchx/specs/test/api_test.py index 67b899512..91538dcb9 100644 --- a/torchx/specs/test/api_test.py +++ b/torchx/specs/test/api_test.py @@ -38,6 +38,7 @@ runopts, parse_mounts, BindMount, + VolumeMount, ) @@ -805,15 +806,16 @@ def test_bindmount(self) -> None: "source=foo1", "readonly", "target=dst1", - "type=bind", + "type=volume", "destination=dst2", "source=foo2", + "readonly", ] ), [ BindMount(src_path="foo", dst_path="dst"), BindMount(src_path="foo1", dst_path="dst1", read_only=True), - BindMount(src_path="foo2", dst_path="dst2"), + VolumeMount(src="foo2", dst_path="dst2", read_only=True), ], ) @@ -827,6 +829,6 @@ def test_invalid(self) -> None: with self.assertRaisesRegex(KeyError, "src"): parse_mounts(["type=bind"]) with self.assertRaisesRegex( - AssertionError, "only bind mounts are currently supported" + ValueError, "invalid mount type.*must be one of.*bind" ): parse_mounts(["type=foo"])