From 304424737ab6d9d92d337c93f6fc41ceb0639934 Mon Sep 17 00:00:00 2001
From: SangBin Cho
Date: Fri, 17 Nov 2023 08:37:15 +0900
Subject: [PATCH 01/66] ip

---
 SANGREADME.md | 66 ++++++++++++
 a.py | 30 ++++++
 python/ray/dag/__init__.py | 2 +
 python/ray/dag/dag_node.py | 2 +-
 python/ray/dag/output_node.py | 49 +++++++++
 python/ray/dag/tests/test_accelerator_dag.py | 102 +++++++++++++++++++
 python/ray/dag/utils.py | 3 +
 python/ray/serve/tests/common/test_dags.py | 1 +
 8 files changed, 254 insertions(+), 1 deletion(-)
 create mode 100644 SANGREADME.md
 create mode 100644 a.py
 create mode 100644 python/ray/dag/output_node.py
 create mode 100644 python/ray/dag/tests/test_accelerator_dag.py

diff --git a/SANGREADME.md b/SANGREADME.md
new file mode 100644
index 0000000000000..f7283cb495891
--- /dev/null
+++ b/SANGREADME.md
@@ -0,0 +1,66 @@
+Actor.bind would kill actors unless I cache the refs. We should fix it.
+When actor calls are bound with actor.method.bind, it doesn't create a new DAG; it appends the bound methods to the existing DAG.
+
+Worker -> method1
+       -> method 2
+
+Instead of 2 DAGs with
+
+method1
+method 2
+
+Only 1 input node is possible with the current DAG API.
+
+Serve: Got around the first issue because all actors are detached.
+Not sure how it got around the second case. Maybe it never needed to handle this case.
+
+Example:
+
+worker = Worker.bind()
+dag = worker.method.bind()
+dag2 = worker.method_2.bind()
+
+This will become
+
+worker -> method -> method2
+
+not
+
+worker -> method
+worker -> method_2
+
+
+VLLM
+
+init_worker
+init torch distributed
+init_model
+profile_num_available_blocks
+init_cache_engine
+
+forward
+
+Q:
+- How much of the existing DAG will be used? Are we going to implement our own DAG APIs? (I believe so?)
+- What's the work needed to make .remote work with actors?
+  - Is actor creation supposed to be a part of the DAG?
+- How will the current shared-memory-based transport feature be exposed in the API?
+- How do we handle different input sizes for different object refs? (Do the remaining bytes just become garbage?)
+- e2e flow
+  - InputNode creates the first buffer (object_ref) that could be reused.
+  - Each bind method reuses the buffer.
+  - If the actor is reused.
+    - Use the first buffer created? We can only have 1 input node anyway now.
+- Iterable DAG -> is it just a repeat of execute?
+
+TODO
+- Currently, any bind from an actor will become a huge single DAG starting from the actor.
+  - Need to find a way to exclude ClassNode from DAG execution.
+- Only one input node is possible for a single actor, but an input node can have multiple inputs.
+  - Maybe we should allow multiple input nodes for a single actor (and use it as a starting point).
+  - Not needed now.
+- No way to keep the actor alive.
+  - There's a private argument, _ray_cache_ref, but it will cache all refs, which is not desirable.
+  - New API as part of bind.
+ +1 DAG can only have 1 input Node diff --git a/a.py b/a.py new file mode 100644 index 0000000000000..fe346dda46c8a --- /dev/null +++ b/a.py @@ -0,0 +1,30 @@ +import ray +from ray.dag.vis_utils import plot +ray.init() + +from ray.dag.input_node import InputNode + +@ray.remote +def a(user_input): + return user_input * 2 + +@ray.remote +def b(user_input): + return user_input + 1 + +@ray.remote +def c(x, y): + return x + y + +with InputNode() as dag_input: + a_ref = a.bind(dag_input) + b_ref = b.bind(dag_input) + dag = c.bind(a_ref, b_ref) + +# a(2) + b(2) = c +# (2 * 2) + (2 * 1) +assert ray.get(dag.execute(2)) == 7 + +# a(3) + b(3) = c +# (3 * 2) + (3 * 1) +assert ray.get(dag.execute(3)) == 10 diff --git a/python/ray/dag/__init__.py b/python/ray/dag/__init__.py index 109b09f125946..985db41be0732 100644 --- a/python/ray/dag/__init__.py +++ b/python/ray/dag/__init__.py @@ -6,6 +6,7 @@ InputAttributeNode, DAGInputData, ) +from ray.dag.output_node import OutputNode from ray.dag.constants import ( PARENT_CLASS_NODE_KEY, PREV_CLASS_METHOD_CALL_KEY, @@ -25,4 +26,5 @@ "PREV_CLASS_METHOD_CALL_KEY", "DAGNODE_TYPE_KEY", "plot", + "OutputNode", ] diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 6408f92de15fd..6c20db8c15124 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -294,7 +294,7 @@ def apply_functional( return replaced_inputs - def _execute_impl(self) -> Union[ray.ObjectRef, ray.actor.ActorHandle]: + def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, ray.actor.ActorHandle]: """Execute this node, assuming args have been transformed already.""" raise NotImplementedError diff --git a/python/ray/dag/output_node.py b/python/ray/dag/output_node.py new file mode 100644 index 0000000000000..c98440a274d62 --- /dev/null +++ b/python/ray/dag/output_node.py @@ -0,0 +1,49 @@ +import ray +from typing import Any, Dict, List, Union, Tuple + +from ray.dag import DAGNode +from ray.dag.format_utils import get_dag_node_str +from ray.experimental.gradio_utils import type_to_string +from ray.util.annotations import Deprecated + +IN_CONTEXT_MANAGER = "__in_context_manager__" + + +class OutputNode(DAGNode): + r"""Ray dag node used in DAG building API to mark the endpoint of DAG + """ + + def __init__( + self, + args: Union[DAGNode, List[DAGNode], Tuple[DAGNode]], + other_args_to_resolve: Dict[str, Any] = None, + ): + if isinstance(args, tuple): + args = list(args) + if not isinstance(args, list): + args = (args,) + super().__init__( + args, + {}, + {}, + other_args_to_resolve=other_args_to_resolve or {}, + ) + + def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, ray.actor.ActorHandle]: + if len(self._bound_args) == 1: + return self._bound_args[0] + else: + return self._bound_args + + def _copy_impl( + self, + new_args: List[Any], + new_kwargs: Dict[str, Any], + new_options: Dict[str, Any], + new_other_args_to_resolve: Dict[str, Any], + ) -> "DAGNode": + """Return a copy of this node with the given new args.""" + return OutputNode(new_args, new_other_args_to_resolve) + + def __str__(self) -> str: + return get_dag_node_str(self, "__OutputNode__") diff --git a/python/ray/dag/tests/test_accelerator_dag.py b/python/ray/dag/tests/test_accelerator_dag.py new file mode 100644 index 0000000000000..cf32dcd00852e --- /dev/null +++ b/python/ray/dag/tests/test_accelerator_dag.py @@ -0,0 +1,102 @@ +import pytest + +import ray +from ray.dag.input_node import InputNode +from ray.dag.output_node import OutputNode +from ray.dag import ( + 
PARENT_CLASS_NODE_KEY, + PREV_CLASS_METHOD_CALL_KEY, +) +from ray.dag.vis_utils import plot + +def test_output_node(shared_ray_instance): + @ray.remote + def f(input): + return input + + with InputNode() as input_data: + dag = OutputNode(f.bind(input_data)) + + assert ray.get(dag.execute(1)) == 1 + assert ray.get(dag.execute(2)) == 2 + + with InputNode() as input_data: + dag = OutputNode([f.bind(input_data["x"]), f.bind(input_data["y"])]) + + refs = dag.execute({"x": 1, "y": 2}) + assert len(refs) == 2 + assert ray.get(refs) == [1, 2] + + with InputNode() as input_data: + dag = OutputNode([ + f.bind(input_data["x"]), + f.bind(input_data["y"]), + f.bind(input_data["x"]) + ]) + + refs = dag.execute({"x": 1, "y": 2}) + assert len(refs) == 3 + assert ray.get(refs) == [1, 2, 1] + + +def test_a(shared_ray_instance): + @ray.remote + class Worker: + def __init__(self): + pass + + def forward(self, input): + print("forward") + + def initialize(self, input): + print("initialize") + + worker = Worker.bind() + with InputNode() as input_node: + dag1 = worker.initialize.bind(input_node) + with InputNode() as input_node: + dag2 = worker.forward.bind(input_node) + + print(ray.get(dag2.execute(1))) + + # plot(dag1, to_file="a.png") + # plot(dag2, to_file="b.png") + + +def test_tensor_parallel_dag(shared_ray_instance): + @ray.remote + class Worker: + def __init__(self, rank): + self.rank = rank + + def forward(self, input_data: int): + print(input_data) + return self.rank + input_data + + def initialize(self): + pass + + with InputNode() as input_data: + workers = [Worker.bind(i) for i in range(4)] + dag = OutputNode( + [worker.forward.bind(input_data) for worker in workers]) + init_dag = OutputNode( + [worker.initialize.bind() for worker in workers]) + + # for _ in range(1): + # refs = dag.execute(2, _ray_cache_refs=True) + # assert len(refs) == 4 + # all_outputs = ray.get(refs) + # assert all_outputs == [2, 3, 4, 5] + + plot(init_dag, to_file="a.png") + plot(dag, to_file="b.png") + # ray.get(init_dag.execute(_ray_cache_refs=True)) + import time + time.sleep(30) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/dag/utils.py b/python/ray/dag/utils.py index fe9ae35168806..3da8570027dbd 100644 --- a/python/ray/dag/utils.py +++ b/python/ray/dag/utils.py @@ -7,6 +7,7 @@ FunctionNode, ClassNode, ClassMethodNode, + OutputNode, ) @@ -22,6 +23,8 @@ def __init__(self): def get_node_name(self, node: DAGNode): # InputNode should be unique. if isinstance(node, InputNode): + return "OUTPUT_NODE" + if isinstance(node, OutputNode): return "INPUT_NODE" # InputAttributeNode suffixes should match the user-defined key. elif isinstance(node, InputAttributeNode): diff --git a/python/ray/serve/tests/common/test_dags.py b/python/ray/serve/tests/common/test_dags.py index ddccf41c5e3a9..8bd7bcbc78912 100644 --- a/python/ray/serve/tests/common/test_dags.py +++ b/python/ray/serve/tests/common/test_dags.py @@ -60,3 +60,4 @@ def get_multi_instantiation_class_nested_deployment_arg_dag(): ray_dag = combine.__call__.bind(dag_input) return ray_dag, dag_input + From 8c5efd8ac3ad421b3eabfe94c1cf69942a9d7502 Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Fri, 17 Nov 2023 15:52:25 +0900 Subject: [PATCH 02/66] basic working. 
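For context, a minimal sketch of the usage this patch enables. It mirrors the
test_dag_with_actor_handle test added below and is illustrative only; binding DAG
nodes on a live actor handle is still experimental at this point in the series.

import ray
from ray.dag.input_node import InputNode

@ray.remote
class Worker:
    def forward(self, x):
        return x

# The actor is created with .remote(), so its lifetime is independent of the DAG.
worker = Worker.remote()

# actor.method.bind() now builds a ClassMethodNode against the live actor handle.
with InputNode() as inp:
    forward_dag = worker.forward.bind(inp)

# The same DAG (and the same actor) can be executed repeatedly.
assert ray.get(forward_dag.execute(1)) == 1
assert ray.get(forward_dag.execute(2)) == 2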
--- SANGREADME.md | 7 +- python/ray/actor.py | 46 +++++++++++- python/ray/dag/__init__.py | 2 - python/ray/dag/class_node.py | 33 ++++----- python/ray/dag/constants.py | 1 - python/ray/dag/dag_node.py | 4 +- python/ray/dag/output_node.py | 2 +- python/ray/dag/tests/test_accelerator_dag.py | 74 +++++++++++++------- python/ray/dag/tests/test_class_dag.py | 27 +------ python/ray/dag/utils.py | 4 +- 10 files changed, 113 insertions(+), 87 deletions(-) diff --git a/SANGREADME.md b/SANGREADME.md index f7283cb495891..4f3a740a0ea72 100644 --- a/SANGREADME.md +++ b/SANGREADME.md @@ -54,12 +54,11 @@ Q: - Iterable DAG -> is it just a repeat of execute? TODO -- Curerntly, any bind from actor will become a huge single DAG starting from actor. +- [done] Curerntly, any bind from actor will become a huge single DAG starting from actor. - Need to find a way to exclude ClassNode from DAG execution. -- Only one input node is possible for a single actor. But input node can have multiple inputs +- [done] Only one input node is possible for a single actor. But input node can have multiple inputs - Maybe we should allow multiple input node for a single actor (and use it as a starting point). - - Not needed now. -- No way to keep the actor alive. +- [done] No way to keep the actor alive. - There's private argument _ray_cache_ref, but it will cache all refs which is not desirable. - New API in the part of bind. diff --git a/python/ray/actor.py b/python/ray/actor.py index a6adfd5e862a7..3019cbe47ca53 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -7,6 +7,7 @@ import ray._private.signature as signature import ray._private.worker import ray._raylet +from ray.dag.class_node import ClassMethodNode, PARENT_CLASS_NODE_KEY from ray import ActorClassID, Language, cross_language from ray._private import ray_option_utils from ray._private.async_compat import is_async_func @@ -136,7 +137,7 @@ def __init__( decorator=None, hardref=False, ): - self._actor_ref = weakref.ref(actor) + self._actor_ref = weakref.proxy(actor) self._method_name = method_name self._num_returns = num_returns self._generator_backpressure_num_objects = generator_backpressure_num_objects @@ -161,6 +162,10 @@ def __call__(self, *args, **kwargs): f"'object.{self._method_name}.remote()'." 
) + @DeveloperAPI + def bind(self, *args, **kwargs): + return self._bind(args, kwargs) + def remote(self, *args, **kwargs): return self._remote(args, kwargs) @@ -181,8 +186,43 @@ def options(self, **options): class FuncWrapper: def remote(self, *args, **kwargs): return func_cls._remote(args=args, kwargs=kwargs, **options) + + @DeveloperAPI + def bind(self, *args, **kwargs): + return func_cls._bind(args=args, kwargs=kwargs, **options) return FuncWrapper() + + @wrap_auto_init + @_tracing_actor_method_invocation + def _bind( + self, + args=None, + kwargs=None, + name="", + num_returns=None, + concurrency_group=None, + _generator_backpressure_num_objects=None, + ): + # TODO(sang): unify option passing + options = { + "name": name, + "num_returns": num_returns, + "concurrency_group": concurrency_group, + "_generator_backpressure_num_objects": _generator_backpressure_num_objects + } + other_args_to_resolve = { + PARENT_CLASS_NODE_KEY: self._actor_ref, + } + + node = ClassMethodNode( + self._method_name, + args, + kwargs, + options, + other_args_to_resolve=other_args_to_resolve, + ) + return node @wrap_auto_init @_tracing_actor_method_invocation @@ -203,7 +243,7 @@ def _remote( ) def invocation(args, kwargs): - actor = self._actor_hard_ref or self._actor_ref() + actor = self._actor_hard_ref or self._actor_ref if actor is None: raise RuntimeError("Lost reference to actor") return actor._actor_method_call( @@ -226,7 +266,7 @@ def invocation(args, kwargs): def __getstate__(self): return { - "actor": self._actor_ref(), + "actor": self._actor_ref, "method_name": self._method_name, "num_returns": self._num_returns, "decorator": self._decorator, diff --git a/python/ray/dag/__init__.py b/python/ray/dag/__init__.py index 985db41be0732..70c4a906393a4 100644 --- a/python/ray/dag/__init__.py +++ b/python/ray/dag/__init__.py @@ -9,7 +9,6 @@ from ray.dag.output_node import OutputNode from ray.dag.constants import ( PARENT_CLASS_NODE_KEY, - PREV_CLASS_METHOD_CALL_KEY, DAGNODE_TYPE_KEY, ) from ray.dag.vis_utils import plot @@ -23,7 +22,6 @@ "InputAttributeNode", "DAGInputData", "PARENT_CLASS_NODE_KEY", - "PREV_CLASS_METHOD_CALL_KEY", "DAGNODE_TYPE_KEY", "plot", "OutputNode", diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index 66eb83084d214..0365682a0eeab 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -1,14 +1,13 @@ +from weakref import ReferenceType + import ray from ray.dag.dag_node import DAGNode from ray.dag.input_node import InputNode from ray.dag.format_utils import get_dag_node_str -from ray.dag.constants import ( - PARENT_CLASS_NODE_KEY, - PREV_CLASS_METHOD_CALL_KEY, -) +from ray.dag.constants import PARENT_CLASS_NODE_KEY from ray.util.annotations import DeveloperAPI -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Union, Tuple @DeveloperAPI @@ -24,7 +23,6 @@ def __init__( other_args_to_resolve=None, ): self._body = cls - self._last_call: Optional["ClassMethodNode"] = None super().__init__( cls_args, cls_kwargs, @@ -85,7 +83,7 @@ def __getattr__(self, method_name: str): raise AttributeError(f".bind() cannot be used again on {type(self)} ") # Raise an error if the method is invalid. 
getattr(self._body, method_name) - call_node = _UnboundClassMethodNode(self, method_name) + call_node = _UnboundClassMethodNode(self, method_name, {}) return call_node def __str__(self) -> str: @@ -93,15 +91,17 @@ def __str__(self) -> str: class _UnboundClassMethodNode(object): - def __init__(self, actor: ClassNode, method_name: str): + def __init__(self, actor: ClassNode, method_name: str, options: dict): + # TODO(sang): Theoretically, We should use weakref cuz it is + # a circular dependency but when I used weakref, it fails + # because we cannot serialize the weakref. self._actor = actor self._method_name = method_name - self._options = {} + self._options = options def bind(self, *args, **kwargs): other_args_to_resolve = { PARENT_CLASS_NODE_KEY: self._actor, - PREV_CLASS_METHOD_CALL_KEY: self._actor._last_call, } node = ClassMethodNode( @@ -111,7 +111,6 @@ def bind(self, *args, **kwargs): self._options, other_args_to_resolve=other_args_to_resolve, ) - self._actor._last_call = node return node def __getattr__(self, attr: str): @@ -146,14 +145,10 @@ def __init__( self._bound_options = method_options or {} self._method_name: str = method_name # Parse other_args_to_resolve and assign to variables - self._parent_class_node: ClassNode = other_args_to_resolve.get( - PARENT_CLASS_NODE_KEY - ) - # Used to track lineage of ClassMethodCall to preserve deterministic - # submission and execution order. - self._prev_class_method_call: Optional[ - ClassMethodNode - ] = other_args_to_resolve.get(PREV_CLASS_METHOD_CALL_KEY, None) + self._parent_class_node: Union[ + ClassNode, + ReferenceType["ray._private.actor.ActorHandle"] + ] = other_args_to_resolve.get(PARENT_CLASS_NODE_KEY) # The actor creation task dependency is encoded as the first argument, # and the ordering dependency as the second, which ensures they are # executed prior to this node. diff --git a/python/ray/dag/constants.py b/python/ray/dag/constants.py index d2d309d56bdaa..77ccb6cc35b78 100644 --- a/python/ray/dag/constants.py +++ b/python/ray/dag/constants.py @@ -1,6 +1,5 @@ # Reserved keys used to handle ClassMethodNode in Ray DAG building. PARENT_CLASS_NODE_KEY = "parent_class_node" -PREV_CLASS_METHOD_CALL_KEY = "prev_class_method_call" # Reserved key to distinguish DAGNode type and avoid collision with user dict. DAGNODE_TYPE_KEY = "__dag_node_type__" diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 6c20db8c15124..cd52f8da07c73 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -105,7 +105,7 @@ def clear_cache(self): def execute( self, *args, _ray_cache_refs: bool = False, **kwargs - ) -> Union[ray.ObjectRef, ray.actor.ActorHandle]: + ) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: """Execute this DAG using the Ray default executor _execute_impl(). 
Args: @@ -294,7 +294,7 @@ def apply_functional( return replaced_inputs - def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, ray.actor.ActorHandle]: + def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: """Execute this node, assuming args have been transformed already.""" raise NotImplementedError diff --git a/python/ray/dag/output_node.py b/python/ray/dag/output_node.py index c98440a274d62..48f02371daef9 100644 --- a/python/ray/dag/output_node.py +++ b/python/ray/dag/output_node.py @@ -29,7 +29,7 @@ def __init__( other_args_to_resolve=other_args_to_resolve or {}, ) - def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, ray.actor.ActorHandle]: + def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: if len(self._bound_args) == 1: return self._bound_args[0] else: diff --git a/python/ray/dag/tests/test_accelerator_dag.py b/python/ray/dag/tests/test_accelerator_dag.py index cf32dcd00852e..91d9c33a6e5b0 100644 --- a/python/ray/dag/tests/test_accelerator_dag.py +++ b/python/ray/dag/tests/test_accelerator_dag.py @@ -3,10 +3,7 @@ import ray from ray.dag.input_node import InputNode from ray.dag.output_node import OutputNode -from ray.dag import ( - PARENT_CLASS_NODE_KEY, - PREV_CLASS_METHOD_CALL_KEY, -) +from ray.dag import PARENT_CLASS_NODE_KEY from ray.dag.vis_utils import plot def test_output_node(shared_ray_instance): @@ -39,28 +36,42 @@ def f(input): assert ray.get(refs) == [1, 2, 1] -def test_a(shared_ray_instance): +def test_dag_with_actor_handle(shared_ray_instance): + """Verify DAG API works with actor created by .remote""" @ray.remote class Worker: def __init__(self): - pass + self.forward_called = 0 + self.init_called = 0 def forward(self, input): print("forward") + self.forward_called += 1 + return input def initialize(self, input): print("initialize") + self.init_called += 1 + return input + + def get(self): + return (self.forward_called, self.init_called) - worker = Worker.bind() + worker = Worker.remote() with InputNode() as input_node: - dag1 = worker.initialize.bind(input_node) + init_dag = worker.initialize.bind(input_node) with InputNode() as input_node: - dag2 = worker.forward.bind(input_node) + forward_dag = worker.forward.bind(input_node) + + assert ray.get(init_dag.execute(1)) == 1 + assert ray.get(forward_dag.execute(2)) == 2 - print(ray.get(dag2.execute(1))) + # Make sure both forward/initialize called only once + assert ray.get(worker.get.remote()) == (1, 1) - # plot(dag1, to_file="a.png") - # plot(dag2, to_file="b.png") + # Double check the actor is resued. + assert ray.get(init_dag.execute(1)) == 1 + assert ray.get(worker.get.remote()) == (1, 2) def test_tensor_parallel_dag(shared_ray_instance): @@ -68,32 +79,41 @@ def test_tensor_parallel_dag(shared_ray_instance): class Worker: def __init__(self, rank): self.rank = rank + self.forwarded = 0 def forward(self, input_data: int): print(input_data) + self.forwarded += 1 return self.rank + input_data def initialize(self): pass + def get_forwarded(self): + return self.forwarded + + NUM_WORKERS = 4 + workers = [Worker.remote(i) for i in range(NUM_WORKERS)] + # Init multiple times. 
+ for _ in range(4): + ray.get([worker.initialize.remote() for worker in workers]) + with InputNode() as input_data: - workers = [Worker.bind(i) for i in range(4)] dag = OutputNode( [worker.forward.bind(input_data) for worker in workers]) - init_dag = OutputNode( - [worker.initialize.bind() for worker in workers]) - - # for _ in range(1): - # refs = dag.execute(2, _ray_cache_refs=True) - # assert len(refs) == 4 - # all_outputs = ray.get(refs) - # assert all_outputs == [2, 3, 4, 5] - - plot(init_dag, to_file="a.png") - plot(dag, to_file="b.png") - # ray.get(init_dag.execute(_ray_cache_refs=True)) - import time - time.sleep(30) + + # Run DAG repetitively. + ITER = 4 + assert ITER > 1 + for i in range(ITER): + ref = dag.execute(i) + all_outputs = ray.get(ref) + assert len(all_outputs) == NUM_WORKERS + assert all_outputs == [i + j for j in range(NUM_WORKERS)] + + forwarded = ray.get( + [worker.get_forwarded.remote() for worker in workers]) + assert forwarded == [ITER for _ in range(NUM_WORKERS)] if __name__ == "__main__": diff --git a/python/ray/dag/tests/test_class_dag.py b/python/ray/dag/tests/test_class_dag.py index 8bef8c792f9f9..f500d7774f02b 100644 --- a/python/ray/dag/tests/test_class_dag.py +++ b/python/ray/dag/tests/test_class_dag.py @@ -1,10 +1,7 @@ import pytest import ray -from ray.dag import ( - PARENT_CLASS_NODE_KEY, - PREV_CLASS_METHOD_CALL_KEY, -) +from ray.dag import PARENT_CLASS_NODE_KEY @ray.remote @@ -150,13 +147,6 @@ def combine(x, y): .get("name") == "a2_v0" ) - # refer to actor method a2.inc.options() call - assert ( - test_a2.get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] - .get_options() - .get("name") - == "v3" - ) # refer to a1 constructor .options() call assert ( test_a1.get_other_args_to_resolve()[PARENT_CLASS_NODE_KEY] @@ -164,21 +154,6 @@ def combine(x, y): .get("name") == "a1_v1" ) - # refer to latest actor method a1.inc.options() call - assert ( - test_a1.get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] - .get_options() - .get("name") - == "v2" - ) - # refer to first bound actor method a1.inc.options() call - assert ( - test_a1.get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] - .get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] - .get_options() - .get("name") - == "v1" - ) def test_pass_actor_handle(shared_ray_instance): diff --git a/python/ray/dag/utils.py b/python/ray/dag/utils.py index 3da8570027dbd..6bb59571b262a 100644 --- a/python/ray/dag/utils.py +++ b/python/ray/dag/utils.py @@ -23,9 +23,9 @@ def __init__(self): def get_node_name(self, node: DAGNode): # InputNode should be unique. if isinstance(node, InputNode): - return "OUTPUT_NODE" - if isinstance(node, OutputNode): return "INPUT_NODE" + if isinstance(node, OutputNode): + return "OUTPUT_NODE" # InputAttributeNode suffixes should match the user-defined key. 
elif isinstance(node, InputAttributeNode): return f"INPUT_ATTRIBUTE_NODE_{node._key}" From 664b07a4aefdb2a6e4b0822295a6592a8114e2bd Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Fri, 17 Nov 2023 23:35:42 +0900 Subject: [PATCH 03/66] enhancement --- a.py | 30 ----------------- python/ray/actor.py | 8 ++--- python/ray/dag/class_node.py | 5 ++- python/ray/dag/dag_node.py | 4 ++- python/ray/dag/input_node.py | 22 ++++++++++++- python/ray/dag/output_node.py | 18 +++++------ python/ray/dag/tests/test_accelerator_dag.py | 34 +++++++++++--------- python/ray/serve/tests/common/test_dags.py | 1 - 8 files changed, 57 insertions(+), 65 deletions(-) delete mode 100644 a.py diff --git a/a.py b/a.py deleted file mode 100644 index fe346dda46c8a..0000000000000 --- a/a.py +++ /dev/null @@ -1,30 +0,0 @@ -import ray -from ray.dag.vis_utils import plot -ray.init() - -from ray.dag.input_node import InputNode - -@ray.remote -def a(user_input): - return user_input * 2 - -@ray.remote -def b(user_input): - return user_input + 1 - -@ray.remote -def c(x, y): - return x + y - -with InputNode() as dag_input: - a_ref = a.bind(dag_input) - b_ref = b.bind(dag_input) - dag = c.bind(a_ref, b_ref) - -# a(2) + b(2) = c -# (2 * 2) + (2 * 1) -assert ray.get(dag.execute(2)) == 7 - -# a(3) + b(3) = c -# (3 * 2) + (3 * 1) -assert ray.get(dag.execute(3)) == 10 diff --git a/python/ray/actor.py b/python/ray/actor.py index 3019cbe47ca53..d2e2877349a0b 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -7,7 +7,6 @@ import ray._private.signature as signature import ray._private.worker import ray._raylet -from ray.dag.class_node import ClassMethodNode, PARENT_CLASS_NODE_KEY from ray import ActorClassID, Language, cross_language from ray._private import ray_option_utils from ray._private.async_compat import is_async_func @@ -30,6 +29,7 @@ StreamingObjectRefGenerator, raise_sys_exit_with_custom_error_message, ) +from ray.dag.class_node import PARENT_CLASS_NODE_KEY, ClassMethodNode from ray.exceptions import AsyncioActorExit from ray.util.annotations import DeveloperAPI, PublicAPI from ray.util.placement_group import _configure_placement_group_based_on_context @@ -186,13 +186,13 @@ def options(self, **options): class FuncWrapper: def remote(self, *args, **kwargs): return func_cls._remote(args=args, kwargs=kwargs, **options) - + @DeveloperAPI def bind(self, *args, **kwargs): return func_cls._bind(args=args, kwargs=kwargs, **options) return FuncWrapper() - + @wrap_auto_init @_tracing_actor_method_invocation def _bind( @@ -209,7 +209,7 @@ def _bind( "name": name, "num_returns": num_returns, "concurrency_group": concurrency_group, - "_generator_backpressure_num_objects": _generator_backpressure_num_objects + "_generator_backpressure_num_objects": _generator_backpressure_num_objects, } other_args_to_resolve = { PARENT_CLASS_NODE_KEY: self._actor_ref, diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index 0365682a0eeab..a474ffa10c553 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -93,7 +93,7 @@ def __str__(self) -> str: class _UnboundClassMethodNode(object): def __init__(self, actor: ClassNode, method_name: str, options: dict): # TODO(sang): Theoretically, We should use weakref cuz it is - # a circular dependency but when I used weakref, it fails + # a circular dependency but when I used weakref, it fails # because we cannot serialize the weakref. 
self._actor = actor self._method_name = method_name @@ -146,8 +146,7 @@ def __init__( self._method_name: str = method_name # Parse other_args_to_resolve and assign to variables self._parent_class_node: Union[ - ClassNode, - ReferenceType["ray._private.actor.ActorHandle"] + ClassNode, ReferenceType["ray._private.actor.ActorHandle"] ] = other_args_to_resolve.get(PARENT_CLASS_NODE_KEY) # The actor creation task dependency is encoded as the first argument, # and the ordering dependency as the second, which ensures they are diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index cd52f8da07c73..6041a12401855 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -294,7 +294,9 @@ def apply_functional( return replaced_inputs - def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: + def _execute_impl( + self, *args, **kwargs + ) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: """Execute this node, assuming args have been transformed already.""" raise NotImplementedError diff --git a/python/ray/dag/input_node.py b/python/ray/dag/input_node.py index 3cffc6891c775..d80786a11c036 100644 --- a/python/ray/dag/input_node.py +++ b/python/ray/dag/input_node.py @@ -92,6 +92,8 @@ def __init__( """ if len(args) != 0 or len(kwargs) != 0: raise ValueError("InputNode should not take any args or kwargs.") + self._args = args + self._kwargs = kwargs self.input_attribute_nodes = {} @@ -103,6 +105,14 @@ def __init__( super().__init__([], {}, {}, other_args_to_resolve=_other_args_to_resolve) + @property + def args(self) -> List[Any]: + return self._args + + @property + def kwargs(self) -> Dict[Any, Any]: + return self._kwargs + def _copy_impl( self, new_args: List[Any], @@ -110,7 +120,9 @@ def _copy_impl( new_options: Dict[str, Any], new_other_args_to_resolve: Dict[str, Any], ): - return InputNode(_other_args_to_resolve=new_other_args_to_resolve) + return InputNode( + *new_args, _other_args_to_resolve=new_other_args_to_resolve, **new_kwargs + ) def _execute_impl(self, *args, **kwargs): """Executor of InputNode.""" @@ -321,6 +333,14 @@ def __init__(self, *args, **kwargs): self._args = list(args) self._kwargs = kwargs + @property + def args(self) -> List[Any]: + return self._args + + @property + def kwargs(self) -> Dict[Any, Any]: + return self._kwargs + def __getitem__(self, key: Union[int, str]) -> Any: if isinstance(key, int): # Access list args by index. 
diff --git a/python/ray/dag/output_node.py b/python/ray/dag/output_node.py index 48f02371daef9..c37e9b0de6954 100644 --- a/python/ray/dag/output_node.py +++ b/python/ray/dag/output_node.py @@ -10,16 +10,17 @@ class OutputNode(DAGNode): - r"""Ray dag node used in DAG building API to mark the endpoint of DAG - """ + r"""Ray dag node used in DAG building API to mark the endpoint of DAG""" def __init__( self, - args: Union[DAGNode, List[DAGNode], Tuple[DAGNode]], + args: Union[List[DAGNode], Tuple[DAGNode]], other_args_to_resolve: Dict[str, Any] = None, ): if isinstance(args, tuple): args = list(args) + if not isinstance(args, list): + raise ValueError(f"Invalid input type for `args`, {type(args)}.") if not isinstance(args, list): args = (args,) super().__init__( @@ -29,11 +30,10 @@ def __init__( other_args_to_resolve=other_args_to_resolve or {}, ) - def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: - if len(self._bound_args) == 1: - return self._bound_args[0] - else: - return self._bound_args + def _execute_impl( + self, *args, **kwargs + ) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: + return self._bound_args def _copy_impl( self, @@ -44,6 +44,6 @@ def _copy_impl( ) -> "DAGNode": """Return a copy of this node with the given new args.""" return OutputNode(new_args, new_other_args_to_resolve) - + def __str__(self) -> str: return get_dag_node_str(self, "__OutputNode__") diff --git a/python/ray/dag/tests/test_accelerator_dag.py b/python/ray/dag/tests/test_accelerator_dag.py index 91d9c33a6e5b0..5bad4e9781380 100644 --- a/python/ray/dag/tests/test_accelerator_dag.py +++ b/python/ray/dag/tests/test_accelerator_dag.py @@ -6,31 +6,34 @@ from ray.dag import PARENT_CLASS_NODE_KEY from ray.dag.vis_utils import plot + def test_output_node(shared_ray_instance): @ray.remote def f(input): return input + with pytest.raises(ValueError): + with InputNode() as input_data: + dag = OutputNode(f.bind(input_data)) + with InputNode() as input_data: - dag = OutputNode(f.bind(input_data)) - - assert ray.get(dag.execute(1)) == 1 - assert ray.get(dag.execute(2)) == 2 + dag = OutputNode([f.bind(input_data)]) + + assert ray.get(dag.execute(1)) == [1] + assert ray.get(dag.execute(2)) == [2] with InputNode() as input_data: dag = OutputNode([f.bind(input_data["x"]), f.bind(input_data["y"])]) - + refs = dag.execute({"x": 1, "y": 2}) assert len(refs) == 2 assert ray.get(refs) == [1, 2] with InputNode() as input_data: - dag = OutputNode([ - f.bind(input_data["x"]), - f.bind(input_data["y"]), - f.bind(input_data["x"]) - ]) - + dag = OutputNode( + [f.bind(input_data["x"]), f.bind(input_data["y"]), f.bind(input_data["x"])] + ) + refs = dag.execute({"x": 1, "y": 2}) assert len(refs) == 3 assert ray.get(refs) == [1, 2, 1] @@ -38,6 +41,7 @@ def f(input): def test_dag_with_actor_handle(shared_ray_instance): """Verify DAG API works with actor created by .remote""" + @ray.remote class Worker: def __init__(self): @@ -53,7 +57,7 @@ def initialize(self, input): print("initialize") self.init_called += 1 return input - + def get(self): return (self.forward_called, self.init_called) @@ -99,8 +103,7 @@ def get_forwarded(self): ray.get([worker.initialize.remote() for worker in workers]) with InputNode() as input_data: - dag = OutputNode( - [worker.forward.bind(input_data) for worker in workers]) + dag = OutputNode([worker.forward.bind(input_data) for worker in workers]) # Run DAG repetitively. 
ITER = 4 @@ -111,8 +114,7 @@ def get_forwarded(self): assert len(all_outputs) == NUM_WORKERS assert all_outputs == [i + j for j in range(NUM_WORKERS)] - forwarded = ray.get( - [worker.get_forwarded.remote() for worker in workers]) + forwarded = ray.get([worker.get_forwarded.remote() for worker in workers]) assert forwarded == [ITER for _ in range(NUM_WORKERS)] diff --git a/python/ray/serve/tests/common/test_dags.py b/python/ray/serve/tests/common/test_dags.py index 8bd7bcbc78912..ddccf41c5e3a9 100644 --- a/python/ray/serve/tests/common/test_dags.py +++ b/python/ray/serve/tests/common/test_dags.py @@ -60,4 +60,3 @@ def get_multi_instantiation_class_nested_deployment_arg_dag(): ray_dag = combine.__call__.bind(dag_input) return ray_dag, dag_input - From 8f6f8d276ae3eb8532c9dfa17f1afd6b9d7838a4 Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Fri, 17 Nov 2023 23:38:00 +0900 Subject: [PATCH 04/66] working now. --- SANGREADME.md | 65 -------------------- python/ray/dag/input_node.py | 22 +------ python/ray/dag/output_node.py | 2 - python/ray/dag/tests/test_accelerator_dag.py | 2 - 4 files changed, 1 insertion(+), 90 deletions(-) delete mode 100644 SANGREADME.md diff --git a/SANGREADME.md b/SANGREADME.md deleted file mode 100644 index 4f3a740a0ea72..0000000000000 --- a/SANGREADME.md +++ /dev/null @@ -1,65 +0,0 @@ -Actor.bind would kill actors unless I cache the refs. We should fix it. -When actor calls are binded with actor.method.bind, it doesn't create a new DAG, but it append binded methods to existing DAG. - -Worker -> method1 - -> method 2 - -Instead of 2 dags with - -method1 -method 2 - -Only 1 input node is possible with current DAG API. - -Serve: Got around the first issue because all actors are detached. -Not sure how it got around the second case. Maybe it never need to handle this case. - -Example: - -worker = Worker.bind() -dag = worker.method.bind() -dag2 = worker.method_2.bind() - -This will become - -worker -> method -> method2 - -not - -worker -> method -worker -> method_2 - - -VLLM - -init_worker -init torch distributed -init_model -profile_num_available_blocks -init_cache_engine - -forward - -Q: -- How much existing DAG will be used? Are we going to implement our own DAG APIs? (I believe so?) -- What's the work needed to make .remote work with actors? - - Is actor creation supposed to be a part of DAG? -- How the current shared memory based transport feature will be exposed to API? -- How do we handle different size input for different object ref? (the remaining bytes are just becoming garbages?) -- e2e flow - - InputNode creates the first buffer (object_ref) that could be reused. - - Each bind method reuses the buffer. - - If actor is reused. - - Use the first buffer created? We can only have 1 input node anyway now. -- Iterable DAG -> is it just a repeat of execute? - -TODO -- [done] Curerntly, any bind from actor will become a huge single DAG starting from actor. - - Need to find a way to exclude ClassNode from DAG execution. -- [done] Only one input node is possible for a single actor. But input node can have multiple inputs - - Maybe we should allow multiple input node for a single actor (and use it as a starting point). -- [done] No way to keep the actor alive. - - There's private argument _ray_cache_ref, but it will cache all refs which is not desirable. - - New API in the part of bind. 
- -1 DAG can only have 1 input Node diff --git a/python/ray/dag/input_node.py b/python/ray/dag/input_node.py index d80786a11c036..3cffc6891c775 100644 --- a/python/ray/dag/input_node.py +++ b/python/ray/dag/input_node.py @@ -92,8 +92,6 @@ def __init__( """ if len(args) != 0 or len(kwargs) != 0: raise ValueError("InputNode should not take any args or kwargs.") - self._args = args - self._kwargs = kwargs self.input_attribute_nodes = {} @@ -105,14 +103,6 @@ def __init__( super().__init__([], {}, {}, other_args_to_resolve=_other_args_to_resolve) - @property - def args(self) -> List[Any]: - return self._args - - @property - def kwargs(self) -> Dict[Any, Any]: - return self._kwargs - def _copy_impl( self, new_args: List[Any], @@ -120,9 +110,7 @@ def _copy_impl( new_options: Dict[str, Any], new_other_args_to_resolve: Dict[str, Any], ): - return InputNode( - *new_args, _other_args_to_resolve=new_other_args_to_resolve, **new_kwargs - ) + return InputNode(_other_args_to_resolve=new_other_args_to_resolve) def _execute_impl(self, *args, **kwargs): """Executor of InputNode.""" @@ -333,14 +321,6 @@ def __init__(self, *args, **kwargs): self._args = list(args) self._kwargs = kwargs - @property - def args(self) -> List[Any]: - return self._args - - @property - def kwargs(self) -> Dict[Any, Any]: - return self._kwargs - def __getitem__(self, key: Union[int, str]) -> Any: if isinstance(key, int): # Access list args by index. diff --git a/python/ray/dag/output_node.py b/python/ray/dag/output_node.py index c37e9b0de6954..d2749cbc1bb0f 100644 --- a/python/ray/dag/output_node.py +++ b/python/ray/dag/output_node.py @@ -3,8 +3,6 @@ from ray.dag import DAGNode from ray.dag.format_utils import get_dag_node_str -from ray.experimental.gradio_utils import type_to_string -from ray.util.annotations import Deprecated IN_CONTEXT_MANAGER = "__in_context_manager__" diff --git a/python/ray/dag/tests/test_accelerator_dag.py b/python/ray/dag/tests/test_accelerator_dag.py index 5bad4e9781380..7114a4f0f0ac7 100644 --- a/python/ray/dag/tests/test_accelerator_dag.py +++ b/python/ray/dag/tests/test_accelerator_dag.py @@ -3,8 +3,6 @@ import ray from ray.dag.input_node import InputNode from ray.dag.output_node import OutputNode -from ray.dag import PARENT_CLASS_NODE_KEY -from ray.dag.vis_utils import plot def test_output_node(shared_ray_instance): From 12b977dd923e4a0b2f16817631f200656e51849b Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 28 Nov 2023 22:08:43 -0800 Subject: [PATCH 05/66] initial commit Signed-off-by: Stephanie Wang --- BUILD.bazel | 4 + python/ray/__init__.py | 1 + python/ray/_private/worker.py | 22 +- python/ray/_raylet.pyx | 20 +- python/ray/includes/libcoreworker.pxd | 7 +- python/ray/tests/test_accelerated_dag.py | 27 +++ src/ray/core_worker/core_worker.cc | 17 +- src/ray/core_worker/core_worker.h | 8 +- .../store_provider/plasma_store_provider.cc | 22 +- .../store_provider/plasma_store_provider.h | 4 +- src/ray/object_manager/common.cc | 155 ++++++++++++++ src/ray/object_manager/common.h | 72 ++++++- src/ray/object_manager/plasma/client.cc | 194 +++++++++++++++--- src/ray/object_manager/plasma/client.h | 8 +- src/ray/object_manager/plasma/common.h | 11 +- src/ray/object_manager/plasma/object_store.cc | 8 + src/ray/object_manager/plasma/plasma.fbs | 2 + src/ray/object_manager/plasma/plasma.h | 3 + .../object_manager/plasma/plasma_allocator.cc | 2 +- src/ray/object_manager/plasma/protocol.cc | 4 + src/ray/object_manager/plasma/store.cc | 56 +++++ src/ray/object_manager/plasma/store.h | 2 + 22 files 
changed, 587 insertions(+), 62 deletions(-) create mode 100644 python/ray/tests/test_accelerated_dag.py create mode 100644 src/ray/object_manager/common.cc diff --git a/BUILD.bazel b/BUILD.bazel index 48eac971a76b0..1f8ff15b53798 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -321,6 +321,7 @@ PLASMA_LINKOPTS = [] + select({ ray_cc_library( name = "plasma_client", srcs = [ + "src/ray/object_manager/common.cc", "src/ray/object_manager/plasma/client.cc", "src/ray/object_manager/plasma/connection.cc", "src/ray/object_manager/plasma/malloc.cc", @@ -401,6 +402,9 @@ ray_cc_library( ":plasma_client", "//src/ray/common:network", ":stats_lib", + "@boost//:asio", + "@boost//:context", + "@boost//:coroutine", ], ) diff --git a/python/ray/__init__.py b/python/ray/__init__.py index e74749ab6e8fa..031f8054cf8e7 100644 --- a/python/ray/__init__.py +++ b/python/ray/__init__.py @@ -118,6 +118,7 @@ def _configure_system(): get, get_actor, get_gpu_ids, + release, init, is_initialized, put, diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index 1d3324fa58fe8..94ab25be014b0 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -688,7 +688,7 @@ def set_mode(self, mode): def set_load_code_from_local(self, load_code_from_local): self._load_code_from_local = load_code_from_local - def put_object(self, value, object_ref=None, owner_address=None): + def put_object(self, value, object_ref=None, owner_address=None, max_readers=-1): """Put value in the local object store with object reference `object_ref`. This assumes that the value for `object_ref` has not yet been placed in @@ -744,7 +744,10 @@ def put_object(self, value, object_ref=None, owner_address=None): # reference counter. return ray.ObjectRef( self.core_worker.put_serialized_object_and_increment_local_ref( - serialized_value, object_ref=object_ref, owner_address=owner_address + serialized_value, + object_ref=object_ref, + owner_address=owner_address, + max_readers=max_readers, ), # The initial local reference is already acquired internally. skip_adding_local_ref=True, @@ -2489,6 +2492,12 @@ def show_in_dashboard(message: str, key: str = "", dtype: str = "text"): blocking_get_inside_async_warned = False +def release(object_ref): + worker = global_worker + worker.check_connected() + worker.core_worker.get_release([object_ref]) + + @overload def get( object_refs: "Sequence[ObjectRef[Any]]", *, timeout: Optional[float] = None @@ -2623,7 +2632,10 @@ def get( @PublicAPI @client_mode_hook def put( - value: Any, *, _owner: Optional["ray.actor.ActorHandle"] = None + value: Any, + *, + _owner: Optional["ray.actor.ActorHandle"] = None, + max_readers=-1, ) -> "ray.ObjectRef": """Store an object in the object store. 
@@ -2669,7 +2681,9 @@ def put( with profiling.profile("ray.put"): try: - object_ref = worker.put_object(value, owner_address=serialize_owner_address) + object_ref = worker.put_object( + value, owner_address=serialize_owner_address, max_readers=max_readers + ) except ObjectStoreFullError: logger.info( "Put failed since the value was either too large or the " diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 8e30b4bd9907e..e55a29dd08226 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -3336,6 +3336,13 @@ cdef class CoreWorker: return RayObjectsToDataMetadataPairs(results) + def get_release(self, object_refs): + cdef: + c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs) + with nogil: + op_status = CCoreWorkerProcess.GetCoreWorker().GetRelease(c_object_ids) + check_status(op_status) + def get_if_local(self, object_refs): """Get objects from local plasma store directly without a fetch request to raylet.""" @@ -3463,13 +3470,15 @@ cdef class CoreWorker: CCoreWorkerProcess.GetCoreWorker().SealExisting( c_object_id, pin_object=False, generator_id=CObjectID.Nil(), - owner_address=c_owner_address)) + owner_address=c_owner_address, + max_readers=-1)) def put_serialized_object_and_increment_local_ref(self, serialized_object, ObjectRef object_ref=None, c_bool pin_object=True, owner_address=None, - c_bool inline_small_object=True): + c_bool inline_small_object=True, + max_readers=-1): cdef: CObjectID c_object_id shared_ptr[CBuffer] data @@ -3477,6 +3486,7 @@ cdef class CoreWorker: unique_ptr[CAddress] c_owner_address c_vector[CObjectID] contained_object_ids c_vector[CObjectReference] contained_object_refs + int64_t c_max_readers = max_readers metadata = string_to_buffer(serialized_object.metadata) total_bytes = serialized_object.total_bytes @@ -3514,7 +3524,8 @@ cdef class CoreWorker: CCoreWorkerProcess.GetCoreWorker().SealOwned( c_object_id, pin_object, - move(c_owner_address))) + move(c_owner_address), + c_max_readers)) else: # Using custom object refs is not supported because we # can't track their lifecycle, so we don't pin the @@ -3523,7 +3534,8 @@ cdef class CoreWorker: CCoreWorkerProcess.GetCoreWorker().SealExisting( c_object_id, pin_object=False, generator_id=CObjectID.Nil(), - owner_address=move(c_owner_address))) + owner_address=move(c_owner_address), + max_readers=c_max_readers)) return c_object_id.Binary() diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 0f9d158cca352..28fa6375212bb 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -240,10 +240,13 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: shared_ptr[CBuffer] *data, c_bool created_by_worker) CRayStatus SealOwned(const CObjectID &object_id, c_bool pin_object, - const unique_ptr[CAddress] &owner_address) + const unique_ptr[CAddress] &owner_address, + int64_t max_readers) CRayStatus SealExisting(const CObjectID &object_id, c_bool pin_object, const CObjectID &generator_id, - const unique_ptr[CAddress] &owner_address) + const unique_ptr[CAddress] &owner_address, + int64_t max_readers) + CRayStatus GetRelease(const c_vector[CObjectID] &object_ids) CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms, c_vector[shared_ptr[CRayObject]] *results) CRayStatus GetIfLocal( diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py new file mode 100644 index 0000000000000..8f6286a3c5351 --- /dev/null +++ 
b/python/ray/tests/test_accelerated_dag.py @@ -0,0 +1,27 @@ +# coding: utf-8 +import logging +import os +import sys + +import pytest + +import ray +import ray.cluster_utils + +logger = logging.getLogger(__name__) + + +def test_put_mutable_object(ray_start_cluster): + # ref = ray.create_mutable_object(size_bytes=1000) + + max_readers = 1 + arr = b"binary" + ref = ray.put(arr, max_readers=max_readers) + ray.release(ref) + + +if __name__ == "__main__": + if os.environ.get("PARALLEL_CI"): + sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) + else: + sys.exit(pytest.main(["-sv", __file__])) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 5042ce3dc164f..586b69714eb35 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1326,9 +1326,10 @@ Status CoreWorker::CreateExisting(const std::shared_ptr &metadata, Status CoreWorker::SealOwned(const ObjectID &object_id, bool pin_object, - const std::unique_ptr &owner_address) { - auto status = - SealExisting(object_id, pin_object, ObjectID::Nil(), std::move(owner_address)); + const std::unique_ptr &owner_address, + int64_t max_readers) { + auto status = SealExisting( + object_id, pin_object, ObjectID::Nil(), std::move(owner_address), max_readers); if (status.ok()) return status; RemoveLocalReference(object_id); if (reference_counter_->HasReference(object_id)) { @@ -1342,8 +1343,9 @@ Status CoreWorker::SealOwned(const ObjectID &object_id, Status CoreWorker::SealExisting(const ObjectID &object_id, bool pin_object, const ObjectID &generator_id, - const std::unique_ptr &owner_address) { - RAY_RETURN_NOT_OK(plasma_store_provider_->Seal(object_id)); + const std::unique_ptr &owner_address, + int64_t max_readers) { + RAY_RETURN_NOT_OK(plasma_store_provider_->Seal(object_id, max_readers)); if (pin_object) { // Tell the raylet to pin the object **after** it is created. RAY_LOG(DEBUG) << "Pinning sealed object " << object_id; @@ -1367,6 +1369,11 @@ Status CoreWorker::SealExisting(const ObjectID &object_id, return Status::OK(); } +Status CoreWorker::GetRelease(const std::vector &object_ids) { + RAY_CHECK(object_ids.size() == 1); + return plasma_store_provider_->GetRelease(object_ids[0]); +} + Status CoreWorker::Get(const std::vector &ids, const int64_t timeout_ms, std::vector> *results) { diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index e1668b61b6f51..d4f621acd6380 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -656,7 +656,8 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// \return Status. Status SealOwned(const ObjectID &object_id, bool pin_object, - const std::unique_ptr &owner_address = nullptr); + const std::unique_ptr &owner_address = nullptr, + int64_t max_readers = -1); /// Finalize placing an object into the object store. This should be called after /// a corresponding `CreateExisting()` call and then writing into the returned buffer. @@ -673,7 +674,10 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { Status SealExisting(const ObjectID &object_id, bool pin_object, const ObjectID &generator_id = ObjectID::Nil(), - const std::unique_ptr &owner_address = nullptr); + const std::unique_ptr &owner_address = nullptr, + int64_t max_readers = -1); + + Status GetRelease(const std::vector &object_ids); /// Get a list of objects from the object store. Objects that failed to be retrieved /// will be returned as nullptrs. 
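Taken together with the Python-side changes above (ray.put gains a max_readers
argument and a new ray.release() is exposed), the intended end-to-end flow looks
roughly like the sketch below. It is based on the test_put_mutable_object test added
in this patch; the read side of the protocol is still being built out here, so treat
the details as provisional.

import ray

ray.init()

# Writer side: sealing with max_readers routes through CoreWorker::SealOwned(...,
# max_readers) and ultimately plasma Seal(object_id, max_readers).
ref = ray.put(b"binary", max_readers=1)

# ... readers would map the same buffer and consume the value here ...

# ray.release() calls CoreWorker::GetRelease(), handing the buffer back so the
# writer can publish the next version.
ray.release(ref)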
diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.cc b/src/ray/core_worker/store_provider/plasma_store_provider.cc index 827678edc0f85..eb440a668a9dd 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.cc +++ b/src/ray/core_worker/store_provider/plasma_store_provider.cc @@ -153,8 +153,9 @@ Status CoreWorkerPlasmaStoreProvider::Create(const std::shared_ptr &meta return status; } -Status CoreWorkerPlasmaStoreProvider::Seal(const ObjectID &object_id) { - return store_client_.Seal(object_id); +Status CoreWorkerPlasmaStoreProvider::Seal(const ObjectID &object_id, + int64_t max_readers) { + return store_client_.Seal(object_id, max_readers); } Status CoreWorkerPlasmaStoreProvider::Release(const ObjectID &object_id) { @@ -171,12 +172,13 @@ Status CoreWorkerPlasmaStoreProvider::FetchAndGetFromPlasmaStore( absl::flat_hash_map> *results, bool *got_exception) { const auto owner_addresses = reference_counter_->GetOwnerAddresses(batch_ids); - RAY_RETURN_NOT_OK( - raylet_client_->FetchOrReconstruct(batch_ids, - owner_addresses, - fetch_only, - /*mark_worker_blocked*/ !in_direct_call, - task_id)); + // TODO this IPC needs to be skipped in shared mode + // RAY_RETURN_NOT_OK( + // raylet_client_->FetchOrReconstruct(batch_ids, + // owner_addresses, + // fetch_only, + // /*mark_worker_blocked*/ !in_direct_call, + // task_id)); std::vector plasma_results; RAY_RETURN_NOT_OK(store_client_.Get(batch_ids, @@ -215,6 +217,10 @@ Status CoreWorkerPlasmaStoreProvider::FetchAndGetFromPlasmaStore( return Status::OK(); } +Status CoreWorkerPlasmaStoreProvider::GetRelease(const ObjectID &object_id) { + return store_client_.GetRelease(object_id); +} + Status CoreWorkerPlasmaStoreProvider::GetIfLocal( const std::vector &object_ids, absl::flat_hash_map> *results) { diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.h b/src/ray/core_worker/store_provider/plasma_store_provider.h index 2e08309c6cc88..523aa86a3e5f0 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.h +++ b/src/ray/core_worker/store_provider/plasma_store_provider.h @@ -135,7 +135,7 @@ class CoreWorkerPlasmaStoreProvider { /// /// \param[in] object_id The ID of the object. This can be used as an /// argument to Get to retrieve the object data. - Status Seal(const ObjectID &object_id); + Status Seal(const ObjectID &object_id, int64_t max_readers = -1); /// Release the first reference to the object created by Put() or Create(). This should /// be called exactly once per object and until it is called, the object is pinned and @@ -151,6 +151,8 @@ class CoreWorkerPlasmaStoreProvider { absl::flat_hash_map> *results, bool *got_exception); + Status GetRelease(const ObjectID &object_id); + /// Get objects directly from the local plasma store, without waiting for the /// objects to be fetched from another node. This should only be used /// internally, never by user code. 
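The PlasmaObjectHeader introduced in the next two files is the heart of the
mutable-object support: a small header stored alongside the object data in the plasma
buffer, carrying a version counter plus reader/writer bookkeeping guarded by a
process-shared mutex, condition variable, and semaphore. As a reading aid only, here
is a rough single-process Python analogue of the acquire/release protocol; it ignores
the immutable num_readers == -1 fast path and the shared-memory layout, and the names
simply follow the C++ fields.

import threading

class MutableObjectHeader:
    # Rough analogue of PlasmaObjectHeader (see common.h/common.cc below); the real
    # implementation lives in shared memory and uses pthread process-shared primitives.
    def __init__(self):
        self._cond = threading.Condition()         # stands in for wr_mut + cond
        self._writer_sem = threading.Semaphore(1)  # stands in for rw_semaphore
        self.version = 0
        self.num_readers = 0
        self.num_read_acquires_remaining = 0
        self.num_read_releases_remaining = 0
        self.data_size = 0

    def write_acquire(self, write_version, new_size):
        # The writer blocks until every reader of the previous version released it.
        self._writer_sem.acquire()
        with self._cond:
            assert write_version == self.version + 1
            self.version = write_version
            self.data_size = new_size

    def write_release(self, write_version, num_readers):
        # Publish the new version and allow exactly `num_readers` reads of it.
        with self._cond:
            assert self.version == write_version
            self.num_readers = num_readers
            self.num_read_acquires_remaining = num_readers
            self.num_read_releases_remaining = num_readers
            self._cond.notify_all()

    def read_acquire(self, read_version):
        # Block until the requested (or a newer) version is published and a read
        # slot is still available; return the version actually read.
        with self._cond:
            while self.version < read_version or self.num_read_acquires_remaining == 0:
                self._cond.wait()
            self.num_read_acquires_remaining -= 1
            return self.version

    def read_release(self, read_version):
        # When the last reader finishes, hand the buffer back to the writer.
        with self._cond:
            assert self.version == read_version
            self.num_read_releases_remaining -= 1
            done = self.num_read_releases_remaining == 0
        if done:
            self._writer_sem.release()

# One writer publishing version 1 for a single reader.
header = MutableObjectHeader()

def reader():
    v = header.read_acquire(1)
    header.read_release(v)

t = threading.Thread(target=reader)
t.start()
header.write_acquire(1, new_size=8)
header.write_release(1, num_readers=1)
t.join()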
diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc new file mode 100644 index 0000000000000..4eff0d3e583b4 --- /dev/null +++ b/src/ray/object_manager/common.cc @@ -0,0 +1,155 @@ +#include "ray/object_manager/common.h" + +namespace ray { + +void PrintPlasmaObjectHeader(const PlasmaObjectHeader *header) { + RAY_LOG(DEBUG) << "PlasmaObjectHeader: \n" + << "version: " << header->version << "\n" + << "num_readers: " << header->num_readers << "\n" + << "num_read_acquires_remaining: " << header->num_read_acquires_remaining + << "\n" + << "num_read_releases_remaining: " << header->num_read_releases_remaining + << "\n" + << "data_size: " << header->data_size << "\n"; +} + +void PlasmaObjectHeader::Init() { + // wr_mut is shared between writer and readers. + pthread_mutexattr_t mutex_attr; + pthread_mutexattr_init(&mutex_attr); + pthread_mutexattr_setpshared(&mutex_attr, PTHREAD_PROCESS_SHARED); + pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_ERRORCHECK); + pthread_mutex_init(&wr_mut, &mutex_attr); + + sem_init(&rw_semaphore, PTHREAD_PROCESS_SHARED, 1); + + // Condition is shared between writer and readers. + pthread_condattr_t cond_attr; + pthread_condattr_init(&cond_attr); + pthread_condattr_setpshared(&cond_attr, PTHREAD_PROCESS_SHARED); + pthread_cond_init(&cond, &cond_attr); +} + +void PlasmaObjectHeader::Destroy() { + RAY_CHECK(pthread_mutex_destroy(&wr_mut) == 0); + RAY_CHECK(pthread_cond_destroy(&cond) == 0); + RAY_CHECK(sem_destroy(&rw_semaphore) == 0); +} + +// Get the data size of the plasma object. +// This has to be called only when reader lock is acquired +// via ReadAcquire. +uint64_t PlasmaObjectHeader::GetDataSize() const { + RAY_CHECK_GE(num_read_releases_remaining, 0) + << "ReadAcquire has to be called before calling this method."; + return data_size; +} + +void PlasmaObjectHeader::WriteAcquire(int64_t write_version, uint64_t new_size) { + RAY_LOG(DEBUG) << "WriteAcquire. version: " << write_version; + sem_wait(&rw_semaphore); + RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); + PrintPlasmaObjectHeader(this); + + RAY_CHECK(num_read_acquires_remaining == 0); + RAY_CHECK(num_read_releases_remaining == 0); + RAY_CHECK(write_version == version + 1) + << "Write version " << write_version + << " is more than 1 greater than current version " << version + << ". Are you sure this is the only writer?"; + + num_readers = 0; + version = write_version; + data_size = new_size; + + RAY_LOG(DEBUG) << "WriteAcquire done"; + PrintPlasmaObjectHeader(this); + RAY_CHECK(pthread_mutex_unlock(&wr_mut) == 0); +} + +void PlasmaObjectHeader::WriteRelease(int64_t write_version, int64_t write_num_readers) { + RAY_LOG(DEBUG) << "WriteRelease Waiting. version: " << write_version + << " max readers: " << write_num_readers; + RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); + RAY_LOG(DEBUG) << "WriteRelease " << write_version + << " max readers: " << write_num_readers; + PrintPlasmaObjectHeader(this); + + RAY_CHECK(version == write_version) + << "Write version " << write_version << " no longer matches current version " + << version << ". Are you sure this is the only writer?"; + + version = write_version; + num_readers = write_num_readers; + num_read_acquires_remaining = num_readers; + num_read_releases_remaining = num_readers; + + RAY_LOG(DEBUG) << "WriteRelease done"; + PrintPlasmaObjectHeader(this); + RAY_CHECK(pthread_mutex_unlock(&wr_mut) == 0); + // Signal to all readers. 
+ RAY_CHECK(pthread_cond_broadcast(&cond) == 0); +} + +int64_t PlasmaObjectHeader::ReadAcquire(int64_t read_version) { + RAY_LOG(DEBUG) << "ReadAcquire Waiting" << read_version; + RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); + RAY_LOG(DEBUG) << "ReadAcquire " << read_version; + PrintPlasmaObjectHeader(this); + + while (version < read_version || num_read_acquires_remaining == 0) { + RAY_CHECK(pthread_cond_wait(&cond, &wr_mut) == 0); + } + + if (version > read_version) { + RAY_LOG(WARNING) << "Version " << version << " already exceeds version to read " + << read_version << ". May have missed earlier reads."; + } + + if (num_readers != -1) { + num_read_acquires_remaining--; + RAY_CHECK(num_read_acquires_remaining >= 0) + << "readers acquired exceeds max readers " << num_readers; + // This object can only be read a constant number of times. Tell the caller + // which version was read. + read_version = version; + } else { + read_version = 0; + } + + RAY_LOG(DEBUG) << "ReadAcquire done"; + PrintPlasmaObjectHeader(this); + + RAY_CHECK(pthread_mutex_unlock(&wr_mut) == 0); + // Signal to other readers that they may read. + RAY_CHECK(pthread_cond_signal(&cond) == 0); + return read_version; +} + +void PlasmaObjectHeader::ReadRelease(int64_t read_version) { + bool all_readers_done = false; + RAY_LOG(DEBUG) << "ReadRelease Waiting" << read_version; + RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); + PrintPlasmaObjectHeader(this); + + RAY_LOG(DEBUG) << "ReadRelease " << read_version << " version is currently " << version; + RAY_CHECK(version == read_version) << "Version " << version << " modified from version " + << read_version << " at read start"; + + if (num_readers != -1) { + num_read_releases_remaining--; + RAY_CHECK(num_read_releases_remaining >= 0); + if (num_read_releases_remaining == 0) { + all_readers_done = true; + } + } + + PrintPlasmaObjectHeader(this); + RAY_LOG(DEBUG) << "ReadRelease done"; + RAY_CHECK(pthread_mutex_unlock(&wr_mut) == 0); + if (all_readers_done) { + sem_post(&rw_semaphore); + } +} + +} // namespace ray diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 66829d2511ebc..23634cbae7d35 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -14,6 +14,9 @@ #pragma once +#include + +#include #include #include @@ -36,9 +39,74 @@ using RestoreSpilledObjectCallback = const std::string &, std::function)>; +struct PlasmaObjectHeader { + // Used to signal to the writer when all readers are done. + sem_t rw_semaphore; + + // Protects all following state, used to signal from writer to readers. + pthread_mutex_t wr_mut; + // Used to signal to readers when the writer is done writing a new version. + pthread_cond_t cond; + // The object version. For immutable objects, this gets incremented to 1 on + // the first write and then should never be modified. For mutable objects, + // each new write must increment the version before releasing to readers. + int64_t version = 0; + // The total number of reads allowed before the writer can write again. This + // value should be set by the writer before releasing to readers. + // For immutable objects, this is set to -1 and infinite reads are allowed. + // Otherwise, readers must acquire/release before/after reading. + int64_t num_readers = 0; + // The number of readers who can acquire the current version. For mutable + // objects, readers must ensure this is > 0 and decrement before they read. 
+ // Once this value reaches 0, no more readers are allowed until the writer + // writes a new version. + int64_t num_read_acquires_remaining = 0; + // The number of readers who must release the current version before a new + // version can be written. For mutable objects, readers must decrement this + // when they are done reading the current version. Once this value reaches 0, + // the reader should signal to the writer that they can write again. + int64_t num_read_releases_remaining = 0; + // The valid data and metadata size of the Ray object. + // Not used for immutable objects. + // For mutable objects, this should be modified when the new object has a + // different data/metadata size. + uint64_t data_size = 0; + uint64_t metadata_size = 0; + + void Init(); + + void Destroy(); + + // Blocks until there are no more readers. + // NOTE: Caller should ensure there is one writer at a time. + /// \param write_version The new version for write. + /// \param new_size The new data size of the object. + void WriteAcquire(int64_t write_version, uint64_t new_data_size); + + // Call after completing a write to signal to num_readers many readers. + void WriteRelease(int64_t write_version, int64_t num_readers); + + // Blocks until the given version or a more recent version is ready to read. + // + // \param read_version The minimum version to wait for. + // \return The version that was read. This should be passed to ReadRelease + // when the reader is done. + int64_t ReadAcquire(int64_t read_version); + + // Finishes the read. If all reads are done, signals to the + // writer. This is not necessary to call for objects that have + // num_readers=-1. + void ReadRelease(int64_t read_version); + + // Get the data size of the plasma object. + // The reader must first ReadAcquire. + uint64_t GetDataSize() const; +}; + /// A struct that includes info about the object. struct ObjectInfo { ObjectID object_id; + bool is_mutable; int64_t data_size = 0; int64_t metadata_size = 0; /// Owner's raylet ID. @@ -50,7 +118,9 @@ struct ObjectInfo { /// Owner's worker ID. WorkerID owner_worker_id; - int64_t GetObjectSize() const { return data_size + metadata_size; } + int64_t GetObjectSize() const { + return sizeof(PlasmaObjectHeader) + data_size + metadata_size; + } bool operator==(const ObjectInfo &other) const { return ((object_id == other.object_id) && (data_size == other.data_size) && diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index e3274a058df1c..31a507cb3dc92 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -32,6 +32,7 @@ #include "absl/container/flat_hash_map.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/ray_config.h" +#include "ray/object_manager/common.h" #include "ray/object_manager/plasma/connection.h" #include "ray/object_manager/plasma/plasma.h" #include "ray/object_manager/plasma/protocol.h" @@ -94,6 +95,12 @@ struct ObjectInUseEntry { PlasmaObject object; /// A flag representing whether the object has been sealed. bool is_sealed; + bool is_shared = false; + /// For shared objects only. + /// The last version that we read or wrote. To read or write again, we must + /// pass a newer version than this. 
+ int64_t next_version_to_read = 1; + int64_t next_version_to_write = 1; }; class PlasmaClient::Impl : public std::enable_shared_from_this { @@ -145,13 +152,15 @@ class PlasmaClient::Impl : public std::enable_shared_from_this &object_ids); @@ -195,10 +204,12 @@ class PlasmaClient::Impl : public std::enable_shared_from_thissecond->pointer(); } +ray::PlasmaObjectHeader *PlasmaClient::Impl::GetPlasmaObjectHeader( + const PlasmaObject &object) const { + auto base_ptr = LookupMmappedFile(object.store_fd); + auto header_ptr = base_ptr + object.header_offset; + return reinterpret_cast(header_ptr); +} + bool PlasmaClient::Impl::IsInUse(const ObjectID &object_id) { std::lock_guard guard(client_mutex_); @@ -271,13 +289,14 @@ bool PlasmaClient::Impl::IsInUse(const ObjectID &object_id) { } void PlasmaClient::Impl::IncrementObjectCount(const ObjectID &object_id, - PlasmaObject *object, + const PlasmaObject *object, bool is_sealed) { // Increment the count of the object to track the fact that it is being used. // The corresponding decrement should happen in PlasmaClient::Release. auto elem = objects_in_use_.find(object_id); ObjectInUseEntry *object_entry; if (elem == objects_in_use_.end()) { + RAY_CHECK(object != nullptr); // Add this object ID to the hash table of object IDs in use. The // corresponding call to free happens in PlasmaClient::Release. objects_in_use_[object_id] = std::make_unique(); @@ -287,7 +306,8 @@ void PlasmaClient::Impl::IncrementObjectCount(const ObjectID &object_id, object_entry = objects_in_use_[object_id].get(); } else { object_entry = elem->second.get(); - RAY_CHECK(object_entry->count > 0); + // TODO(swang): Nicer way to pin shared objects. + // RAY_CHECK(object_entry->count > 0); } // Increment the count of the number of instances of this object that are // being used by this client. The corresponding decrement should happen in @@ -368,6 +388,44 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, fb::ObjectSource source, int device_num) { std::unique_lock guard(client_mutex_); + auto object_entry = objects_in_use_.find(object_id); + if (object_entry != objects_in_use_.end()) { + auto &entry = object_entry->second; + if (entry->is_sealed && entry->is_shared) { + RAY_LOG(DEBUG) << "Create shared object " << object_id << " exists"; + // Wait for no readers. + auto plasma_header = GetPlasmaObjectHeader(entry->object); + // TODO(sang) + // NOTE: entry->object.data_size is the size of the data buffer. + // When the object is shared, we can have object size smaller than the data buffer. + RAY_LOG(DEBUG) << "SANG-TODO Update the data size of " << object_id + << ". Size: " << data_size; + auto next_version_to_write = plasma_header->version + 1; + plasma_header->WriteAcquire(next_version_to_write, data_size); + + // Prepare the data buffer and return to the client instead of sending + // the IPC to object store. + *data = std::make_shared( + shared_from_this(), + GetStoreFdAndMmap(entry->object.store_fd, entry->object.mmap_size) + + entry->object.data_offset, + entry->object.data_size); + // If plasma_create is being called from a transfer, then we will not copy the + // metadata here. The metadata will be written along with the data streamed + // from the transfer. + if (metadata != NULL) { + // Copy the metadata to the buffer. 
+ memcpy((*data)->Data() + entry->object.data_size, + metadata, + entry->object.metadata_size); + } + + entry->is_sealed = false; + IncrementObjectCount(object_id, &entry->object, false); + } + return Status::OK(); + } + uint64_t retry_with_request_id = 0; RAY_LOG(DEBUG) << "called plasma_create on conn " << store_conn_ << " with size " @@ -394,6 +452,20 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, object_id, retry_with_request_id, metadata, &retry_with_request_id, data); } + if (status.ok()) { + // Create IPC was successful. + object_entry = objects_in_use_.find(object_id); + RAY_CHECK(object_entry != objects_in_use_.end()); + auto &entry = object_entry->second; + RAY_CHECK(!entry->is_sealed); + auto plasma_header = GetPlasmaObjectHeader(entry->object); + // The corresponding WriteRelease takes place in Seal. + // When an object is first created, the data size is equivalent to + // buffer size. + // The first creation's version is always 1. + plasma_header->WriteAcquire(/*next_version_to_write*/ 1, entry->object.data_size); + } + return status; } @@ -457,8 +529,19 @@ Status PlasmaClient::Impl::GetBuffers( all_present = false; } else { PlasmaObject *object = &object_entry->second->object; - std::shared_ptr physical_buf; + // Wait for the object to become ready to read. + auto plasma_header = GetPlasmaObjectHeader(*object); + int64_t version_read = + plasma_header->ReadAcquire(object_entry->second->next_version_to_read); + auto data_size = plasma_header->GetDataSize(); + RAY_LOG(DEBUG) << "SANG-TODO data size is " << data_size; + if (version_read > 0) { + object_entry->second->is_shared = true; + object_entry->second->next_version_to_read = version_read; + } + + std::shared_ptr physical_buf; if (object->device_num == 0) { uint8_t *data = LookupMmappedFile(object->store_fd); physical_buf = std::make_shared( @@ -467,8 +550,7 @@ Status PlasmaClient::Impl::GetBuffers( RAY_LOG(FATAL) << "GPU library is not enabled."; } physical_buf = wrap_buffer(object_ids[i], physical_buf); - object_buffers[i].data = - SharedMemoryBuffer::Slice(physical_buf, 0, object->data_size); + object_buffers[i].data = SharedMemoryBuffer::Slice(physical_buf, 0, data_size); object_buffers[i].metadata = SharedMemoryBuffer::Slice( physical_buf, object->data_size, object->metadata_size); object_buffers[i].device_num = object->device_num; @@ -525,6 +607,19 @@ Status PlasmaClient::Impl::GetBuffers( // If we are here, the object was not currently in use, so we need to // process the reply from the object store. if (object->data_size != -1) { + // Increment the count of the number of instances of this object that this + // client is using. Cache the reference to the object. + IncrementObjectCount(received_object_ids[i], object, true); + auto &object_entry = objects_in_use_[received_object_ids[i]]; + // Wait for the object to become ready to read. + auto plasma_header = GetPlasmaObjectHeader(*object); + int64_t version_read = plasma_header->ReadAcquire(/*version=*/1); + auto data_size = plasma_header->GetDataSize(); + if (version_read > 0) { + object_entry->is_shared = true; + object_entry->next_version_to_read = version_read; + } + std::shared_ptr physical_buf; if (object->device_num == 0) { uint8_t *data = LookupMmappedFile(object->store_fd); @@ -535,14 +630,10 @@ Status PlasmaClient::Impl::GetBuffers( } // Finish filling out the return values. 
physical_buf = wrap_buffer(object_ids[i], physical_buf); - object_buffers[i].data = - SharedMemoryBuffer::Slice(physical_buf, 0, object->data_size); + object_buffers[i].data = SharedMemoryBuffer::Slice(physical_buf, 0, data_size); object_buffers[i].metadata = SharedMemoryBuffer::Slice( physical_buf, object->data_size, object->metadata_size); object_buffers[i].device_num = object->device_num; - // Increment the count of the number of instances of this object that this - // client is using. Cache the reference to the object. - IncrementObjectCount(received_object_ids[i], object, true); } else { // The object was not retrieved. The caller can detect this condition // by checking the boolean value of the metadata/data buffers. @@ -569,6 +660,29 @@ Status PlasmaClient::Impl::Get(const std::vector &object_ids, &object_ids[0], num_objects, timeout_ms, wrap_buffer, &(*out)[0], is_from_worker); } +Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { + RAY_LOG(DEBUG) << "Try to release Get for object " << object_id; + std::unique_lock guard(client_mutex_); + auto object_entry = objects_in_use_.find(object_id); + if (object_entry == objects_in_use_.end()) { + return Status::ObjectNotFound( + "ray.release() called on an object that is not in scope"); + } + + auto &entry = object_entry->second; + // RAY_CHECK(entry->is_sealed && entry->is_shared) << "ray.release must be called on " + // "objects that are sealed and shared. sealed? " << entry->is_sealed + // << " shared " << entry->is_shared; + + RAY_LOG(DEBUG) << "Release shared object " << object_id; + auto plasma_header = GetPlasmaObjectHeader(entry->object); + plasma_header->ReadRelease(entry->next_version_to_read); + // The next read needs to read at least this version. + entry->next_version_to_read++; + + return Status::OK(); +} + Status PlasmaClient::Impl::MarkObjectUnused(const ObjectID &object_id) { auto object_entry = objects_in_use_.find(object_id); RAY_CHECK(object_entry != objects_in_use_.end()); @@ -592,7 +706,8 @@ Status PlasmaClient::Impl::Release(const ObjectID &object_id) { object_entry->second->count -= 1; RAY_CHECK(object_entry->second->count >= 0); // Check if the client is no longer using this object. - if (object_entry->second->count == 0) { + // TODO(swang): Nicer way to pin shared objects. + if (object_entry->second->count == 0 && !object_entry->second->is_shared) { // object_entry is invalidated in MarkObjectUnused, need to read the fd beforehand. MEMFD_TYPE fd = object_entry->second->object.store_fd; // Tell the store that the client no longer needs the object. @@ -648,7 +763,7 @@ Status PlasmaClient::Impl::Contains(const ObjectID &object_id, bool *has_object) return Status::OK(); } -Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { +Status PlasmaClient::Impl::Seal(const ObjectID &object_id, int64_t num_readers) { std::lock_guard guard(client_mutex_); // Make sure this client has a reference to the object before sending the @@ -662,20 +777,33 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { return Status::ObjectAlreadySealed("Seal() called on an already sealed object"); } + auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); + // The value should've already updated when object is created. 
+ auto next_version_to_write = plasma_header->version; + plasma_header->WriteRelease( + /*write_version=*/next_version_to_write, num_readers); + object_entry->second->next_version_to_write = next_version_to_write; + + if (num_readers != -1) { + object_entry->second->is_shared = true; + } object_entry->second->is_sealed = true; - /// Send the seal request to Plasma. - RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); - std::vector buffer; - RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); - ObjectID sealed_id; - RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); - RAY_CHECK(sealed_id == object_id); - // We call PlasmaClient::Release to decrement the number of instances of this - // object - // that are currently being used by this client. The corresponding increment - // happened in plasma_create and was used to ensure that the object was not - // released before the call to PlasmaClient::Seal. - return Release(object_id); + //// Send the seal request to Plasma. + // RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); + // std::vector buffer; + // RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); + // ObjectID sealed_id; + // RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); + // RAY_CHECK(sealed_id == object_id); + //// We call PlasmaClient::Release to decrement the number of instances of this + //// object + //// that are currently being used by this client. The corresponding increment + //// happened in plasma_create and was used to ensure that the object was not + //// released before the call to PlasmaClient::Seal. + // return Release(object_id); + + // TODO(swang): Release the object if the ref count == 0. + return Status::OK(); } Status PlasmaClient::Impl::Abort(const ObjectID &object_id) { @@ -847,6 +975,10 @@ Status PlasmaClient::Get(const std::vector &object_ids, return impl_->Get(object_ids, timeout_ms, object_buffers, is_from_worker); } +Status PlasmaClient::GetRelease(const ObjectID &object_id) { + return impl_->GetRelease(object_id); +} + Status PlasmaClient::Release(const ObjectID &object_id) { return impl_->Release(object_id); } @@ -857,7 +989,9 @@ Status PlasmaClient::Contains(const ObjectID &object_id, bool *has_object) { Status PlasmaClient::Abort(const ObjectID &object_id) { return impl_->Abort(object_id); } -Status PlasmaClient::Seal(const ObjectID &object_id) { return impl_->Seal(object_id); } +Status PlasmaClient::Seal(const ObjectID &object_id, int64_t num_readers) { + return impl_->Seal(object_id, num_readers); +} Status PlasmaClient::Delete(const ObjectID &object_id) { return impl_->Delete(std::vector{object_id}); diff --git a/src/ray/object_manager/plasma/client.h b/src/ray/object_manager/plasma/client.h index d466528ecd275..74841df373fee 100644 --- a/src/ray/object_manager/plasma/client.h +++ b/src/ray/object_manager/plasma/client.h @@ -82,13 +82,15 @@ class PlasmaClientInterface { std::vector *object_buffers, bool is_from_worker) = 0; + virtual Status GetRelease(const ObjectID &object_id) = 0; + /// Seal an object in the object store. The object will be immutable after /// this /// call. /// /// \param object_id The ID of the object to seal. /// \return The return status. - virtual Status Seal(const ObjectID &object_id) = 0; + virtual Status Seal(const ObjectID &object_id, int64_t num_readers = -1) = 0; /// Abort an unsealed object in the object store. 
If the abort succeeds, then /// it will be as if the object was never created at all. The unsealed object @@ -255,6 +257,8 @@ class PlasmaClient : public PlasmaClientInterface { std::vector *object_buffers, bool is_from_worker); + Status GetRelease(const ObjectID &object_id); + /// Tell Plasma that the client no longer needs the object. This should be /// called after Get() or Create() when the client is done with the object. /// After this call, the buffer returned by Get() is no longer valid. @@ -290,7 +294,7 @@ class PlasmaClient : public PlasmaClientInterface { /// /// \param object_id The ID of the object to seal. /// \return The return status. - Status Seal(const ObjectID &object_id); + Status Seal(const ObjectID &object_id, int64_t num_readers = -1); /// Delete an object from the object store. This currently assumes that the /// object is present, has been sealed and not used by another client. Otherwise, diff --git a/src/ray/object_manager/plasma/common.h b/src/ray/object_manager/plasma/common.h index a4e8f8337372b..d74eb88cec8b8 100644 --- a/src/ray/object_manager/plasma/common.h +++ b/src/ray/object_manager/plasma/common.h @@ -123,14 +123,21 @@ class LocalObject { const plasma::flatbuf::ObjectSource &GetSource() const { return source; } + ray::PlasmaObjectHeader *GetPlasmaObjectHeader() const { + auto header_ptr = static_cast(allocation.address); + return reinterpret_cast(header_ptr); + } + void ToPlasmaObject(PlasmaObject *object, bool check_sealed) const { RAY_DCHECK(object != nullptr); if (check_sealed) { RAY_DCHECK(Sealed()); } object->store_fd = GetAllocation().fd; - object->data_offset = GetAllocation().offset; - object->metadata_offset = GetAllocation().offset + GetObjectInfo().data_size; + object->header_offset = GetAllocation().offset; + object->data_offset = GetAllocation().offset + sizeof(ray::PlasmaObjectHeader); + object->metadata_offset = GetAllocation().offset + sizeof(ray::PlasmaObjectHeader) + + GetObjectInfo().data_size; object->data_size = GetObjectInfo().data_size; object->metadata_size = GetObjectInfo().metadata_size; object->device_num = GetAllocation().device_num; diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index a36ad1d54906a..260951f178567 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -47,6 +47,10 @@ const LocalObject *ObjectStore::CreateObject(const ray::ObjectInfo &object_info, entry->construct_duration = -1; entry->source = source; + auto plasma_header = entry->GetPlasmaObjectHeader(); + *plasma_header = ray::PlasmaObjectHeader{}; + plasma_header->Init(); + RAY_LOG(DEBUG) << "create object " << object_info.object_id << " succeeded"; return entry; } @@ -74,6 +78,10 @@ bool ObjectStore::DeleteObject(const ObjectID &object_id) { if (entry == nullptr) { return false; } + // TODO(swang): Make sure Seal coroutine is done before deleting. + auto plasma_header = entry->GetPlasmaObjectHeader(); + plasma_header->Destroy(); + allocator_.Free(std::move(entry->allocation)); object_table_.erase(object_id); return true; diff --git a/src/ray/object_manager/plasma/plasma.fbs b/src/ray/object_manager/plasma/plasma.fbs index e5e7714aebc20..68177465f3366 100644 --- a/src/ray/object_manager/plasma/plasma.fbs +++ b/src/ray/object_manager/plasma/plasma.fbs @@ -96,6 +96,8 @@ struct PlasmaObjectSpec { segment_index: int; // The unique id of the segment fd in case of fd reuse. 
unique_fd_id: long; + // The offset in bytes in the memory mapped file of the plasma object header. + header_offset: ulong; // The offset in bytes in the memory mapped file of the data. data_offset: ulong; // The size in bytes of the data. diff --git a/src/ray/object_manager/plasma/plasma.h b/src/ray/object_manager/plasma/plasma.h index 0f8a00b061424..775226c922665 100644 --- a/src/ray/object_manager/plasma/plasma.h +++ b/src/ray/object_manager/plasma/plasma.h @@ -37,6 +37,9 @@ struct PlasmaObject { /// a unique identifier of the file in the client to look up the corresponding /// file descriptor on the client's side. MEMFD_TYPE store_fd; + /// The offset in bytes in the memory mapped file of the plasma object + /// header. + ptrdiff_t header_offset; /// The offset in bytes in the memory mapped file of the data. ptrdiff_t data_offset; /// The offset in bytes in the memory mapped file of the metadata. diff --git a/src/ray/object_manager/plasma/plasma_allocator.cc b/src/ray/object_manager/plasma/plasma_allocator.cc index 3737024ab416a..06cdb20bf3d5d 100644 --- a/src/ray/object_manager/plasma/plasma_allocator.cc +++ b/src/ray/object_manager/plasma/plasma_allocator.cc @@ -75,7 +75,7 @@ PlasmaAllocator::PlasmaAllocator(const std::string &plasma_directory, auto allocation = Allocate(kFootprintLimit - kDlMallocReserved); RAY_CHECK(allocation.has_value()) << "PlasmaAllocator initialization failed." - << " It's likely we don't have enought space in " << plasma_directory; + << " It's likely we don't have enough space in " << plasma_directory; // This will unmap the file, but the next one created will be as large // as this one (this is an implementation detail of dlmalloc). Free(std::move(allocation.value())); diff --git a/src/ray/object_manager/plasma/protocol.cc b/src/ray/object_manager/plasma/protocol.cc index 50595cde53701..79b9a27827fb1 100644 --- a/src/ray/object_manager/plasma/protocol.cc +++ b/src/ray/object_manager/plasma/protocol.cc @@ -260,6 +260,7 @@ Status SendCreateReply(const std::shared_ptr &client, flatbuffers::FlatBufferBuilder fbb; PlasmaObjectSpec plasma_object(FD2INT(object.store_fd.first), object.store_fd.second, + object.header_offset, object.data_offset, object.data_size, object.metadata_offset, @@ -300,6 +301,7 @@ Status ReadCreateReply(uint8_t *data, object->store_fd.first = INT2FD(message->plasma_object()->segment_index()); object->store_fd.second = message->plasma_object()->unique_fd_id(); + object->header_offset = message->plasma_object()->header_offset(); object->data_offset = message->plasma_object()->data_offset(); object->data_size = message->plasma_object()->data_size(); object->metadata_offset = message->plasma_object()->metadata_offset(); @@ -614,6 +616,7 @@ Status SendGetReply(const std::shared_ptr &client, << " metadata_size: " << object.metadata_size; objects.push_back(PlasmaObjectSpec(FD2INT(object.store_fd.first), object.store_fd.second, + object.header_offset, object.data_offset, object.data_size, object.metadata_offset, @@ -654,6 +657,7 @@ Status ReadGetReply(uint8_t *data, const PlasmaObjectSpec *object = message->plasma_objects()->Get(i); plasma_objects[i].store_fd.first = INT2FD(object->segment_index()); plasma_objects[i].store_fd.second = object->unique_fd_id(); + plasma_objects[i].header_offset = object->header_offset(); plasma_objects[i].data_offset = object->data_offset(); plasma_objects[i].data_size = object->data_size(); plasma_objects[i].metadata_offset = object->metadata_offset(); diff --git a/src/ray/object_manager/plasma/store.cc 
b/src/ray/object_manager/plasma/store.cc index 66876de42cbcc..e948e885aecfa 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -31,7 +31,9 @@ #include #include #include +#include +#include #include #include #include @@ -525,11 +527,65 @@ void PlasmaStore::ReplyToCreateClient(const std::shared_ptr &client, error == PlasmaError::OK && result.device_num == 0) { static_cast(client->SendFd(result.store_fd)); } + + WaitForSeal(object_id, client); } else { static_cast(SendUnfinishedCreateReply(client, object_id, req_id)); } } +void PlasmaStore::WaitForSeal(const ObjectID &object_id, + const std::shared_ptr &client) { + auto entry = object_lifecycle_mgr_.GetObject(object_id); + RAY_CHECK(entry); + auto plasma_header = entry->GetPlasmaObjectHeader(); + + int event_fd = eventfd(0, EFD_CLOEXEC); + RAY_CHECK(event_fd != -1); + + auto wait_fn = [event_fd, plasma_header]() { + plasma_header->ReadAcquire(/*read_version=*/1); + + uint64_t data = 1; + auto num_bytes_written = write(event_fd, &data, sizeof(data)); + // TODO(swang): Need proper error checking here. + if (num_bytes_written != sizeof(data)) { + RAY_LOG(WARNING) << num_bytes_written << " bytes written on fd " << event_fd + << " err: " << strerror(errno); + } + }; + + auto wait_thread = std::make_shared(wait_fn); + + boost::asio::spawn( + io_context_, + [this, event_fd, object_id, plasma_header, wait_thread, client]( + boost::asio::yield_context yield) { + auto event_stream = std::make_shared( + io_context_, event_fd); + auto data = std::make_shared(0); + auto buf = boost::asio::buffer(data.get(), sizeof(*data)); + boost::asio::async_read( + *event_stream, + buf, + [this, event_stream, data, object_id, event_fd, wait_thread]( + const boost::system::error_code &ec, size_t bytes_transferred) { + RAY_CHECK(bytes_transferred == sizeof(*data)) << ec.message(); + + // RAY_CHECK(plasma_header->num_readers == -1) << + // plasma_header->num_readers; + + { + absl::MutexLock lock(&mutex_); + SealObjects({object_id}); + } + + wait_thread->join(); + close(event_fd); + }); + }); +} + int64_t PlasmaStore::GetConsumedBytes() { return total_consumed_bytes_; } bool PlasmaStore::IsObjectSpillable(const ObjectID &object_id) { diff --git a/src/ray/object_manager/plasma/store.h b/src/ray/object_manager/plasma/store.h index a6c992c131280..74c33edb9a2c6 100644 --- a/src/ray/object_manager/plasma/store.h +++ b/src/ray/object_manager/plasma/store.h @@ -118,6 +118,8 @@ class PlasmaStore { return available; } + void WaitForSeal(const ObjectID &object_id, const std::shared_ptr &client); + private: /// Create a new object. The client must do a call to release_object to tell /// the store when it is done with the object. 
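For reference, the handshake implemented by PlasmaObjectHeader in the patch above works like this: the writer calls WriteAcquire, which blocks on rw_semaphore until every reader of the previous version has released it, writes the new data, and then calls WriteRelease to publish the version to num_readers readers; each reader calls ReadAcquire to wait for that version and ReadRelease when finished, and the last ReadRelease posts the semaphore so the writer may write again. The sketch below is a minimal, standalone model of that handshake, not the Ray implementation: it reuses the field and method names from common.h but omits the plasma store, the mmap'd object files, the immutable (num_readers == -1) path, and most error checking. (The next patch moves the num_readers assignment out of WriteRelease, but the handshake itself is unchanged.)

// Standalone model of the PlasmaObjectHeader writer/reader handshake.
// Assumed/illustrative: struct layout, sizes, and the fork-based demo.
// Build with: g++ -std=c++17 -pthread header_demo.cc
#include <pthread.h>
#include <semaphore.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>
#include <new>

struct Header {
  sem_t rw_semaphore;      // writer waits here until all reads are released
  pthread_mutex_t wr_mut;  // protects the fields below
  pthread_cond_t cond;     // writer -> readers: a new version is ready
  int64_t version = 0;
  int64_t num_readers = 0;
  int64_t num_read_acquires_remaining = 0;
  int64_t num_read_releases_remaining = 0;
  uint64_t data_size = 0;

  void Init() {
    pthread_mutexattr_t m;
    pthread_mutexattr_init(&m);
    pthread_mutexattr_setpshared(&m, PTHREAD_PROCESS_SHARED);
    pthread_mutex_init(&wr_mut, &m);
    sem_init(&rw_semaphore, /*pshared=*/1, /*value=*/1);
    pthread_condattr_t c;
    pthread_condattr_init(&c);
    pthread_condattr_setpshared(&c, PTHREAD_PROCESS_SHARED);
    pthread_cond_init(&cond, &c);
  }

  void WriteAcquire(int64_t write_version, uint64_t new_size) {
    sem_wait(&rw_semaphore);  // wait until the previous version is fully read
    pthread_mutex_lock(&wr_mut);
    version = write_version;
    data_size = new_size;
    pthread_mutex_unlock(&wr_mut);
  }

  void WriteRelease(int64_t write_version, int64_t readers) {
    pthread_mutex_lock(&wr_mut);
    num_readers = readers;
    num_read_acquires_remaining = readers;
    num_read_releases_remaining = readers;
    pthread_mutex_unlock(&wr_mut);
    pthread_cond_broadcast(&cond);  // wake readers waiting for this version
  }

  int64_t ReadAcquire(int64_t read_version) {
    pthread_mutex_lock(&wr_mut);
    while (version < read_version || num_read_acquires_remaining == 0) {
      pthread_cond_wait(&cond, &wr_mut);
    }
    num_read_acquires_remaining--;
    int64_t got = version;  // tell the caller which version it actually read
    pthread_mutex_unlock(&wr_mut);
    return got;
  }

  void ReadRelease(int64_t /*read_version*/) {
    pthread_mutex_lock(&wr_mut);
    bool all_readers_done = (--num_read_releases_remaining == 0);
    pthread_mutex_unlock(&wr_mut);
    if (all_readers_done) sem_post(&rw_semaphore);  // re-arm the writer
  }
};

int main() {
  // Place the header in anonymous shared memory so a forked reader sees it,
  // standing in for the header at the start of a plasma allocation.
  auto *hdr = static_cast<Header *>(mmap(nullptr, sizeof(Header),
                                         PROT_READ | PROT_WRITE,
                                         MAP_SHARED | MAP_ANONYMOUS, -1, 0));
  new (hdr) Header();
  hdr->Init();

  if (fork() == 0) {  // reader process
    int64_t v = hdr->ReadAcquire(/*read_version=*/1);
    std::printf("read version %lld, data_size %llu\n",
                (long long)v, (unsigned long long)hdr->data_size);
    hdr->ReadRelease(v);
    _exit(0);
  }

  hdr->WriteAcquire(/*write_version=*/1, /*new_size=*/64);  // begin write
  hdr->WriteRelease(/*write_version=*/1, /*readers=*/1);    // allow one read
  wait(nullptr);
  return 0;
}

In this model, as in the patch, a second WriteAcquire would block until the single permitted read has been released, which is the mechanism the store relies on in WaitForSeal and the client relies on in GetRelease.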
From 1c935b9a62b694578a2123bb24608b514391fab9 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 29 Nov 2023 15:57:05 -0800 Subject: [PATCH 06/66] Add special calls for create and put mutable objects Signed-off-by: Stephanie Wang --- python/ray/__init__.py | 2 + python/ray/_private/worker.py | 57 +++++- python/ray/_raylet.pxd | 3 +- python/ray/_raylet.pyx | 49 ++++-- python/ray/includes/libcoreworker.pxd | 13 +- python/ray/tests/test_accelerated_dag.py | 10 +- src/ray/core_worker/core_worker.cc | 25 ++- src/ray/core_worker/core_worker.h | 13 +- .../store_provider/plasma_store_provider.cc | 23 ++- .../store_provider/plasma_store_provider.h | 11 +- src/ray/object_manager/common.cc | 14 +- src/ray/object_manager/common.h | 5 +- src/ray/object_manager/object_buffer_pool.cc | 1 + src/ray/object_manager/plasma/client.cc | 163 +++++++++++------- src/ray/object_manager/plasma/client.h | 20 ++- src/ray/object_manager/plasma/object_store.cc | 7 + src/ray/object_manager/plasma/plasma.fbs | 2 + src/ray/object_manager/plasma/protocol.cc | 3 + src/ray/object_manager/plasma/protocol.h | 1 + src/ray/object_manager/plasma/store.cc | 54 ++---- src/ray/object_manager/plasma/store.h | 2 + 21 files changed, 324 insertions(+), 154 deletions(-) diff --git a/python/ray/__init__.py b/python/ray/__init__.py index 031f8054cf8e7..d95d2f5a20c5b 100644 --- a/python/ray/__init__.py +++ b/python/ray/__init__.py @@ -114,6 +114,8 @@ def _configure_system(): WORKER_MODE, RESTORE_WORKER_MODE, SPILL_WORKER_MODE, + _create_mutable_object, + _put_mutable_object, cancel, get, get_actor, diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index 94ab25be014b0..4e5b08f6dedb1 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -688,7 +688,7 @@ def set_mode(self, mode): def set_load_code_from_local(self, load_code_from_local): self._load_code_from_local = load_code_from_local - def put_object(self, value, object_ref=None, owner_address=None, max_readers=-1): + def put_object(self, value, object_ref=None, owner_address=None, is_mutable=False): """Put value in the local object store with object reference `object_ref`. This assumes that the value for `object_ref` has not yet been placed in @@ -736,6 +736,11 @@ def put_object(self, value, object_ref=None, owner_address=None, max_readers=-1) f"{sio.getvalue()}" ) raise TypeError(msg) from e + + # If the object is mutable, then the raylet should never read the + # object. Instead, clients will keep the object pinned. + pin_object = not is_mutable + # This *must* be the first place that we construct this python # ObjectRef because an entry with 0 local references is created when # the object is Put() in the core worker, expecting that this python @@ -746,8 +751,9 @@ def put_object(self, value, object_ref=None, owner_address=None, max_readers=-1) self.core_worker.put_serialized_object_and_increment_local_ref( serialized_value, object_ref=object_ref, + pin_object=pin_object, owner_address=owner_address, - max_readers=max_readers, + is_mutable=is_mutable, ), # The initial local reference is already acquired internally. 
skip_adding_local_ref=True, @@ -2629,13 +2635,54 @@ def get( return values +@PublicAPI +def _put_mutable_object(value: Any, object_ref: ObjectRef, num_readers: int): + worker = global_worker + worker.check_connected() + + try: + serialized_value = worker.get_serialization_context().serialize(value) + except TypeError as e: + sio = io.StringIO() + ray.util.inspect_serializability(value, print_file=sio) + msg = ( + "Could not serialize the put value " f"{repr(value)}:\n" f"{sio.getvalue()}" + ) + raise TypeError(msg) from e + + worker.core_worker.put_serialized_object_to_mutable_plasma_object( + serialized_value, + object_ref, + num_readers, + ) + + +@PublicAPI +def _create_mutable_object( + buffer_size: int, +) -> "ray.ObjectRef": + worker = global_worker + worker.check_connected() + + value = b"0" * buffer_size + + try: + object_ref = worker.put_object(value, owner_address=None, is_mutable=True) + except ObjectStoreFullError: + logger.info( + "Put failed since the value was either too large or the " + "store was full of pinned objects." + ) + raise + return object_ref + + @PublicAPI @client_mode_hook def put( value: Any, *, _owner: Optional["ray.actor.ActorHandle"] = None, - max_readers=-1, ) -> "ray.ObjectRef": """Store an object in the object store. @@ -2681,9 +2728,7 @@ def put( with profiling.profile("ray.put"): try: - object_ref = worker.put_object( - value, owner_address=serialize_owner_address, max_readers=max_readers - ) + object_ref = worker.put_object(value, owner_address=serialize_owner_address) except ObjectStoreFullError: logger.info( "Put failed since the value was either too large or the " diff --git a/python/ray/_raylet.pxd b/python/ray/_raylet.pxd index 015c636454dfe..f4f54cffacec0 100644 --- a/python/ray/_raylet.pxd +++ b/python/ray/_raylet.pxd @@ -134,7 +134,8 @@ cdef class CoreWorker: CObjectID *c_object_id, shared_ptr[CBuffer] *data, c_bool created_by_worker, owner_address=*, - c_bool inline_small_object=*) + c_bool inline_small_object=*, + c_bool is_mutable=*) cdef unique_ptr[CAddress] _convert_python_address(self, address=*) cdef store_task_output( self, serialized_object, diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index e55a29dd08226..3aa2422180c07 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -3373,7 +3373,9 @@ cdef class CoreWorker: CObjectID *c_object_id, shared_ptr[CBuffer] *data, c_bool created_by_worker, owner_address=None, - c_bool inline_small_object=True): + c_bool inline_small_object=True, + c_bool is_mutable=False, + ): cdef: unique_ptr[CAddress] c_owner_address @@ -3383,7 +3385,7 @@ cdef class CoreWorker: with nogil: check_status(CCoreWorkerProcess.GetCoreWorker() .CreateOwnedAndIncrementLocalRef( - metadata, data_size, contained_ids, + is_mutable, metadata, data_size, contained_ids, c_object_id, data, created_by_worker, move(c_owner_address), inline_small_object)) @@ -3470,15 +3472,42 @@ cdef class CoreWorker: CCoreWorkerProcess.GetCoreWorker().SealExisting( c_object_id, pin_object=False, generator_id=CObjectID.Nil(), - owner_address=c_owner_address, - max_readers=-1)) + owner_address=c_owner_address)) + + def put_serialized_object_to_mutable_plasma_object(self, serialized_object, + ObjectRef object_ref, + num_readers, + ): + cdef: + CObjectID c_object_id = object_ref.native() + shared_ptr[CBuffer] data + unique_ptr[CAddress] null_owner_address + + metadata = string_to_buffer(serialized_object.metadata) + data_size = serialized_object.total_bytes + 
check_status(CCoreWorkerProcess.GetCoreWorker().WriteAcquireMutableObject( + c_object_id, + metadata, + data_size, + num_readers, + &data, + )) + if data_size > 0: + (serialized_object).write_to( + Buffer.make(data)) + check_status( + CCoreWorkerProcess.GetCoreWorker().SealExisting( + c_object_id, pin_object=False, + generator_id=CObjectID.Nil(), + owner_address=null_owner_address)) def put_serialized_object_and_increment_local_ref(self, serialized_object, ObjectRef object_ref=None, c_bool pin_object=True, owner_address=None, c_bool inline_small_object=True, - max_readers=-1): + c_bool is_mutable=False, + ): cdef: CObjectID c_object_id shared_ptr[CBuffer] data @@ -3486,7 +3515,6 @@ cdef class CoreWorker: unique_ptr[CAddress] c_owner_address c_vector[CObjectID] contained_object_ids c_vector[CObjectReference] contained_object_refs - int64_t c_max_readers = max_readers metadata = string_to_buffer(serialized_object.metadata) total_bytes = serialized_object.total_bytes @@ -3495,7 +3523,8 @@ cdef class CoreWorker: object_already_exists = self._create_put_buffer( metadata, total_bytes, object_ref, contained_object_ids, - &c_object_id, &data, True, owner_address, inline_small_object) + &c_object_id, &data, True, owner_address, inline_small_object, + is_mutable) logger.debug( f"Serialized object size of {c_object_id.Hex()} is {total_bytes} bytes") @@ -3524,8 +3553,7 @@ cdef class CoreWorker: CCoreWorkerProcess.GetCoreWorker().SealOwned( c_object_id, pin_object, - move(c_owner_address), - c_max_readers)) + move(c_owner_address))) else: # Using custom object refs is not supported because we # can't track their lifecycle, so we don't pin the @@ -3534,8 +3562,7 @@ cdef class CoreWorker: CCoreWorkerProcess.GetCoreWorker().SealExisting( c_object_id, pin_object=False, generator_id=CObjectID.Nil(), - owner_address=move(c_owner_address), - max_readers=c_max_readers)) + owner_address=move(c_owner_address))) return c_object_id.Binary() diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 28fa6375212bb..bf6a6e810fc9d 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -226,6 +226,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: const c_vector[CObjectID] &contained_object_ids, const CObjectID &object_id) CRayStatus CreateOwnedAndIncrementLocalRef( + c_bool is_mutable, const shared_ptr[CBuffer] &metadata, const size_t data_size, const c_vector[CObjectID] &contained_object_ids, @@ -239,13 +240,17 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: const CAddress &owner_address, shared_ptr[CBuffer] *data, c_bool created_by_worker) + CRayStatus WriteAcquireMutableObject( + const CObjectID &object_id, + const shared_ptr[CBuffer] &metadata, + uint64_t data_size, + int64_t num_readers, + shared_ptr[CBuffer] *data) CRayStatus SealOwned(const CObjectID &object_id, c_bool pin_object, - const unique_ptr[CAddress] &owner_address, - int64_t max_readers) + const unique_ptr[CAddress] &owner_address) CRayStatus SealExisting(const CObjectID &object_id, c_bool pin_object, const CObjectID &generator_id, - const unique_ptr[CAddress] &owner_address, - int64_t max_readers) + const unique_ptr[CAddress] &owner_address) CRayStatus GetRelease(const c_vector[CObjectID] &object_ids) CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms, c_vector[shared_ptr[CRayObject]] *results) diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index 8f6286a3c5351..405d78d46cc9b 100644 --- 
a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -12,12 +12,10 @@ def test_put_mutable_object(ray_start_cluster): - # ref = ray.create_mutable_object(size_bytes=1000) - - max_readers = 1 - arr = b"binary" - ref = ray.put(arr, max_readers=max_readers) - ray.release(ref) + ray.init() + ref = ray._create_mutable_object(1000) + ray._put_mutable_object(b"hello", ref, num_readers=1) + assert ray.get(ref) == b"hello" if __name__ == "__main__": diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 586b69714eb35..72f5b698d9234 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1219,6 +1219,7 @@ Status CoreWorker::Put(const RayObject &object, } Status CoreWorker::CreateOwnedAndIncrementLocalRef( + bool is_mutable, const std::shared_ptr &metadata, const size_t data_size, const std::vector &contained_object_ids, @@ -1293,7 +1294,8 @@ Status CoreWorker::CreateOwnedAndIncrementLocalRef( *object_id, /* owner_address = */ real_owner_address, data, - created_by_worker); + created_by_worker, + is_mutable); } if (!status.ok()) { RemoveLocalReference(*object_id); @@ -1324,12 +1326,20 @@ Status CoreWorker::CreateExisting(const std::shared_ptr &metadata, } } +Status CoreWorker::WriteAcquireMutableObject(const ObjectID &object_id, + const std::shared_ptr &metadata, + uint64_t data_size, + int64_t num_readers, + std::shared_ptr *data) { + return plasma_store_provider_->WriteAcquireMutableObject( + object_id, metadata, data_size, num_readers, data); +} + Status CoreWorker::SealOwned(const ObjectID &object_id, bool pin_object, - const std::unique_ptr &owner_address, - int64_t max_readers) { - auto status = SealExisting( - object_id, pin_object, ObjectID::Nil(), std::move(owner_address), max_readers); + const std::unique_ptr &owner_address) { + auto status = + SealExisting(object_id, pin_object, ObjectID::Nil(), std::move(owner_address)); if (status.ok()) return status; RemoveLocalReference(object_id); if (reference_counter_->HasReference(object_id)) { @@ -1343,9 +1353,8 @@ Status CoreWorker::SealOwned(const ObjectID &object_id, Status CoreWorker::SealExisting(const ObjectID &object_id, bool pin_object, const ObjectID &generator_id, - const std::unique_ptr &owner_address, - int64_t max_readers) { - RAY_RETURN_NOT_OK(plasma_store_provider_->Seal(object_id, max_readers)); + const std::unique_ptr &owner_address) { + RAY_RETURN_NOT_OK(plasma_store_provider_->Seal(object_id)); if (pin_object) { // Tell the raylet to pin the object **after** it is created. RAY_LOG(DEBUG) << "Pinning sealed object " << object_id; diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index d4f621acd6380..3db71661d8695 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -614,6 +614,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// small. /// \return Status. Status CreateOwnedAndIncrementLocalRef( + bool is_mutable, const std::shared_ptr &metadata, const size_t data_size, const std::vector &contained_object_ids, @@ -642,6 +643,12 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { std::shared_ptr *data, bool created_by_worker); + Status WriteAcquireMutableObject(const ObjectID &object_id, + const std::shared_ptr &metadata, + uint64_t data_size, + int64_t num_readers, + std::shared_ptr *data); + /// Finalize placing an object into the object store. 
This should be called after /// a corresponding `CreateOwned()` call and then writing into the returned buffer. /// @@ -656,8 +663,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// \return Status. Status SealOwned(const ObjectID &object_id, bool pin_object, - const std::unique_ptr &owner_address = nullptr, - int64_t max_readers = -1); + const std::unique_ptr &owner_address = nullptr); /// Finalize placing an object into the object store. This should be called after /// a corresponding `CreateExisting()` call and then writing into the returned buffer. @@ -674,8 +680,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { Status SealExisting(const ObjectID &object_id, bool pin_object, const ObjectID &generator_id = ObjectID::Nil(), - const std::unique_ptr &owner_address = nullptr, - int64_t max_readers = -1); + const std::unique_ptr &owner_address = nullptr); Status GetRelease(const std::vector &object_ids); diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.cc b/src/ray/core_worker/store_provider/plasma_store_provider.cc index eb440a668a9dd..30ae14daef662 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.cc +++ b/src/ray/core_worker/store_provider/plasma_store_provider.cc @@ -108,12 +108,27 @@ Status CoreWorkerPlasmaStoreProvider::Put(const RayObject &object, return Status::OK(); } +Status CoreWorkerPlasmaStoreProvider::WriteAcquireMutableObject( + const ObjectID &object_id, + const std::shared_ptr &metadata, + uint64_t data_size, + int64_t num_readers, + std::shared_ptr *data) { + return store_client_.WriteAcquireMutableObject(object_id, + data_size, + metadata ? metadata->Data() : nullptr, + metadata ? metadata->Size() : 0, + num_readers, + data); +} + Status CoreWorkerPlasmaStoreProvider::Create(const std::shared_ptr &metadata, const size_t data_size, const ObjectID &object_id, const rpc::Address &owner_address, std::shared_ptr *data, - bool created_by_worker) { + bool created_by_worker, + bool is_mutable) { auto source = plasma::flatbuf::ObjectSource::CreatedByWorker; if (!created_by_worker) { source = plasma::flatbuf::ObjectSource::RestoredFromStorage; @@ -121,6 +136,7 @@ Status CoreWorkerPlasmaStoreProvider::Create(const std::shared_ptr &meta Status status = store_client_.CreateAndSpillIfNeeded(object_id, owner_address, + is_mutable, data_size, metadata ? metadata->Data() : nullptr, metadata ? 
metadata->Size() : 0, @@ -153,9 +169,8 @@ Status CoreWorkerPlasmaStoreProvider::Create(const std::shared_ptr &meta return status; } -Status CoreWorkerPlasmaStoreProvider::Seal(const ObjectID &object_id, - int64_t max_readers) { - return store_client_.Seal(object_id, max_readers); +Status CoreWorkerPlasmaStoreProvider::Seal(const ObjectID &object_id) { + return store_client_.Seal(object_id); } Status CoreWorkerPlasmaStoreProvider::Release(const ObjectID &object_id) { diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.h b/src/ray/core_worker/store_provider/plasma_store_provider.h index 523aa86a3e5f0..2c7242a02f4a1 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.h +++ b/src/ray/core_worker/store_provider/plasma_store_provider.h @@ -126,7 +126,14 @@ class CoreWorkerPlasmaStoreProvider { const ObjectID &object_id, const rpc::Address &owner_address, std::shared_ptr *data, - bool created_by_worker); + bool created_by_worker, + bool is_mutable = false); + + Status WriteAcquireMutableObject(const ObjectID &object_id, + const std::shared_ptr &metadata, + uint64_t data_size, + int64_t num_readers, + std::shared_ptr *data); /// Seal an object buffer created with Create(). /// @@ -135,7 +142,7 @@ class CoreWorkerPlasmaStoreProvider { /// /// \param[in] object_id The ID of the object. This can be used as an /// argument to Get to retrieve the object data. - Status Seal(const ObjectID &object_id, int64_t max_readers = -1); + Status Seal(const ObjectID &object_id); /// Release the first reference to the object created by Put() or Create(). This should /// be called exactly once per object and until it is called, the object is pinned and diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index 4eff0d3e583b4..24e906d9ba4d5 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -67,12 +67,10 @@ void PlasmaObjectHeader::WriteAcquire(int64_t write_version, uint64_t new_size) RAY_CHECK(pthread_mutex_unlock(&wr_mut) == 0); } -void PlasmaObjectHeader::WriteRelease(int64_t write_version, int64_t write_num_readers) { - RAY_LOG(DEBUG) << "WriteRelease Waiting. version: " << write_version - << " max readers: " << write_num_readers; +void PlasmaObjectHeader::WriteRelease(int64_t write_version) { + RAY_LOG(DEBUG) << "WriteRelease Waiting. version: " << write_version; RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); - RAY_LOG(DEBUG) << "WriteRelease " << write_version - << " max readers: " << write_num_readers; + RAY_LOG(DEBUG) << "WriteRelease " << write_version; PrintPlasmaObjectHeader(this); RAY_CHECK(version == write_version) @@ -80,11 +78,11 @@ void PlasmaObjectHeader::WriteRelease(int64_t write_version, int64_t write_num_r << version << ". Are you sure this is the only writer?"; version = write_version; - num_readers = write_num_readers; + RAY_CHECK(num_readers != 0); num_read_acquires_remaining = num_readers; num_read_releases_remaining = num_readers; - RAY_LOG(DEBUG) << "WriteRelease done"; + RAY_LOG(DEBUG) << "WriteRelease done, num_readers: " << num_readers; PrintPlasmaObjectHeader(this); RAY_CHECK(pthread_mutex_unlock(&wr_mut) == 0); // Signal to all readers. 
@@ -92,7 +90,7 @@ void PlasmaObjectHeader::WriteRelease(int64_t write_version, int64_t write_num_r } int64_t PlasmaObjectHeader::ReadAcquire(int64_t read_version) { - RAY_LOG(DEBUG) << "ReadAcquire Waiting" << read_version; + RAY_LOG(DEBUG) << "ReadAcquire waiting version " << read_version; RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); RAY_LOG(DEBUG) << "ReadAcquire " << read_version; PrintPlasmaObjectHeader(this); diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 23634cbae7d35..395c86ee8223b 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -83,8 +83,9 @@ struct PlasmaObjectHeader { /// \param new_size The new data size of the object. void WriteAcquire(int64_t write_version, uint64_t new_data_size); - // Call after completing a write to signal to num_readers many readers. - void WriteRelease(int64_t write_version, int64_t num_readers); + // Call after completing a write to signal that readers may read. + // num_readers should be set before calling this. + void WriteRelease(int64_t write_version); // Blocks until the given version or a more recent version is ready to read. // diff --git a/src/ray/object_manager/object_buffer_pool.cc b/src/ray/object_manager/object_buffer_pool.cc index 8004fb588811d..a42a921fc50a7 100644 --- a/src/ray/object_manager/object_buffer_pool.cc +++ b/src/ray/object_manager/object_buffer_pool.cc @@ -241,6 +241,7 @@ ray::Status ObjectBufferPool::EnsureBufferExists(const ObjectID &object_id, Status s = store_client_->CreateAndSpillIfNeeded( object_id, owner_address, + /*is_mutable=*/false, static_cast(object_size), nullptr, static_cast(metadata_size), diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 31a507cb3dc92..57c62bf486df9 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -95,7 +95,7 @@ struct ObjectInUseEntry { PlasmaObject object; /// A flag representing whether the object has been sealed. bool is_sealed; - bool is_shared = false; + bool is_mutable = false; /// For shared objects only. /// The last version that we read or wrote. To read or write again, we must /// pass a newer version than this. @@ -119,6 +119,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data); + Status Get(const std::vector &object_ids, int64_t timeout_ms, std::vector *object_buffers, @@ -160,7 +168,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this &object_ids); @@ -371,6 +379,7 @@ Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, // client is using. A call to PlasmaClient::Release is required to decrement // this count. Cache the reference to the object. IncrementObjectCount(object_id, &object, false); + // TODO(swang): Remove the second increment call. 
// We increment the count a second time (and the corresponding decrement will // happen in a PlasmaClient::Release call in plasma_seal) so even if the // buffer returned by PlasmaClient::Create goes out of scope, the object does @@ -379,8 +388,57 @@ Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, return Status::OK(); } +Status PlasmaClient::Impl::WriteAcquireMutableObject(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data) { + std::unique_lock guard(client_mutex_); + auto object_entry = objects_in_use_.find(object_id); + RAY_CHECK(object_entry != objects_in_use_.end()); + + auto &entry = object_entry->second; + RAY_CHECK(entry->is_mutable); + RAY_CHECK(entry->is_sealed) << "Must Seal before writing again to a mutable object"; + + RAY_LOG(DEBUG) << "Write mutable object " << object_id; + + // Wait for no readers. + auto plasma_header = GetPlasmaObjectHeader(entry->object); + // NOTE: entry->object.data_size is the size of the data buffer. + // When the object is shared, we can have object size smaller than the data buffer. + // TODO(swang): Better exception. + // TODO(swang): Support data size larger than allocated buffer. + RAY_CHECK(data_size <= entry->object.data_size) + << "Cannot write mutable data size " << data_size + << " larger than allocated buffer size " << entry->object.data_size; + // TODO(swang): Support different metadata size. + RAY_CHECK(metadata_size == entry->object.metadata_size) + << "Metadata size must stay the same"; + plasma_header->WriteAcquire(entry->next_version_to_write, data_size); + plasma_header->num_readers = num_readers; + + // Prepare the data buffer and return to the client instead of sending + // the IPC to object store. + *data = std::make_shared( + shared_from_this(), + GetStoreFdAndMmap(entry->object.store_fd, entry->object.mmap_size) + + entry->object.data_offset, + data_size); + if (metadata != NULL) { + // Copy the metadata to the buffer. + memcpy( + (*data)->Data() + entry->object.data_size, metadata, entry->object.metadata_size); + } + + entry->is_sealed = false; + return Status::OK(); +} + Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, + bool is_mutable, int64_t data_size, const uint8_t *metadata, int64_t metadata_size, @@ -388,44 +446,6 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, fb::ObjectSource source, int device_num) { std::unique_lock guard(client_mutex_); - auto object_entry = objects_in_use_.find(object_id); - if (object_entry != objects_in_use_.end()) { - auto &entry = object_entry->second; - if (entry->is_sealed && entry->is_shared) { - RAY_LOG(DEBUG) << "Create shared object " << object_id << " exists"; - // Wait for no readers. - auto plasma_header = GetPlasmaObjectHeader(entry->object); - // TODO(sang) - // NOTE: entry->object.data_size is the size of the data buffer. - // When the object is shared, we can have object size smaller than the data buffer. - RAY_LOG(DEBUG) << "SANG-TODO Update the data size of " << object_id - << ". Size: " << data_size; - auto next_version_to_write = plasma_header->version + 1; - plasma_header->WriteAcquire(next_version_to_write, data_size); - - // Prepare the data buffer and return to the client instead of sending - // the IPC to object store. 
- *data = std::make_shared( - shared_from_this(), - GetStoreFdAndMmap(entry->object.store_fd, entry->object.mmap_size) + - entry->object.data_offset, - entry->object.data_size); - // If plasma_create is being called from a transfer, then we will not copy the - // metadata here. The metadata will be written along with the data streamed - // from the transfer. - if (metadata != NULL) { - // Copy the metadata to the buffer. - memcpy((*data)->Data() + entry->object.data_size, - metadata, - entry->object.metadata_size); - } - - entry->is_sealed = false; - IncrementObjectCount(object_id, &entry->object, false); - } - return Status::OK(); - } - uint64_t retry_with_request_id = 0; RAY_LOG(DEBUG) << "called plasma_create on conn " << store_conn_ << " with size " @@ -433,6 +453,7 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, RAY_RETURN_NOT_OK(SendCreateRequest(store_conn_, object_id, owner_address, + is_mutable, data_size, metadata_size, source, @@ -454,16 +475,28 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, if (status.ok()) { // Create IPC was successful. - object_entry = objects_in_use_.find(object_id); + auto object_entry = objects_in_use_.find(object_id); RAY_CHECK(object_entry != objects_in_use_.end()); auto &entry = object_entry->second; RAY_CHECK(!entry->is_sealed); + entry->is_mutable = is_mutable; + auto plasma_header = GetPlasmaObjectHeader(entry->object); // The corresponding WriteRelease takes place in Seal. // When an object is first created, the data size is equivalent to // buffer size. // The first creation's version is always 1. - plasma_header->WriteAcquire(/*next_version_to_write*/ 1, entry->object.data_size); + RAY_CHECK(entry->next_version_to_write == 1); + plasma_header->WriteAcquire(/*next_version_to_write*/ entry->next_version_to_write, + entry->object.data_size); + if (entry->is_mutable) { + // The plasma store is the first reader. Once it read-releases, the + // writer may write an actual value. + plasma_header->num_readers = 1; + } else { + // Anyone may read. + plasma_header->num_readers = -1; + } } return status; @@ -494,6 +527,7 @@ Status PlasmaClient::Impl::TryCreateImmediately(const ObjectID &object_id, RAY_RETURN_NOT_OK(SendCreateRequest(store_conn_, object_id, owner_address, + /*is_mutable=*/false, data_size, metadata_size, source, @@ -537,7 +571,7 @@ Status PlasmaClient::Impl::GetBuffers( auto data_size = plasma_header->GetDataSize(); RAY_LOG(DEBUG) << "SANG-TODO data size is " << data_size; if (version_read > 0) { - object_entry->second->is_shared = true; + object_entry->second->is_mutable = true; object_entry->second->next_version_to_read = version_read; } @@ -616,7 +650,7 @@ Status PlasmaClient::Impl::GetBuffers( int64_t version_read = plasma_header->ReadAcquire(/*version=*/1); auto data_size = plasma_header->GetDataSize(); if (version_read > 0) { - object_entry->is_shared = true; + object_entry->is_mutable = true; object_entry->next_version_to_read = version_read; } @@ -670,9 +704,13 @@ Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { } auto &entry = object_entry->second; - // RAY_CHECK(entry->is_sealed && entry->is_shared) << "ray.release must be called on " - // "objects that are sealed and shared. sealed? 
" << entry->is_sealed - // << " shared " << entry->is_shared; + if (!entry->is_sealed) { + return Status::ObjectNotFound("ray.release() called on an object that is not sealed"); + } + if (!entry->is_mutable) { + return Status::ObjectNotFound( + "ray.release() called on an object that is not mutable"); + } RAY_LOG(DEBUG) << "Release shared object " << object_id; auto plasma_header = GetPlasmaObjectHeader(entry->object); @@ -707,7 +745,7 @@ Status PlasmaClient::Impl::Release(const ObjectID &object_id) { RAY_CHECK(object_entry->second->count >= 0); // Check if the client is no longer using this object. // TODO(swang): Nicer way to pin shared objects. - if (object_entry->second->count == 0 && !object_entry->second->is_shared) { + if (object_entry->second->count == 0 && !object_entry->second->is_mutable) { // object_entry is invalidated in MarkObjectUnused, need to read the fd beforehand. MEMFD_TYPE fd = object_entry->second->object.store_fd; // Tell the store that the client no longer needs the object. @@ -763,7 +801,7 @@ Status PlasmaClient::Impl::Contains(const ObjectID &object_id, bool *has_object) return Status::OK(); } -Status PlasmaClient::Impl::Seal(const ObjectID &object_id, int64_t num_readers) { +Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { std::lock_guard guard(client_mutex_); // Make sure this client has a reference to the object before sending the @@ -778,15 +816,10 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id, int64_t num_readers) } auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); - // The value should've already updated when object is created. - auto next_version_to_write = plasma_header->version; plasma_header->WriteRelease( - /*write_version=*/next_version_to_write, num_readers); - object_entry->second->next_version_to_write = next_version_to_write; - - if (num_readers != -1) { - object_entry->second->is_shared = true; - } + /*write_version=*/object_entry->second->next_version_to_write); + // The next Write must pass a higher version. + object_entry->second->next_version_to_write++; object_entry->second->is_sealed = true; //// Send the seal request to Plasma. 
// RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); @@ -932,8 +965,19 @@ Status PlasmaClient::Connect(const std::string &store_socket_name, store_socket_name, manager_socket_name, release_delay, num_retries); } +Status PlasmaClient::WriteAcquireMutableObject(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data) { + return impl_->WriteAcquireMutableObject( + object_id, data_size, metadata, metadata_size, num_readers, data); +} + Status PlasmaClient::CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, + bool is_mutable, int64_t data_size, const uint8_t *metadata, int64_t metadata_size, @@ -942,6 +986,7 @@ Status PlasmaClient::CreateAndSpillIfNeeded(const ObjectID &object_id, int device_num) { return impl_->CreateAndSpillIfNeeded(object_id, owner_address, + is_mutable, data_size, metadata, metadata_size, @@ -989,9 +1034,7 @@ Status PlasmaClient::Contains(const ObjectID &object_id, bool *has_object) { Status PlasmaClient::Abort(const ObjectID &object_id) { return impl_->Abort(object_id); } -Status PlasmaClient::Seal(const ObjectID &object_id, int64_t num_readers) { - return impl_->Seal(object_id, num_readers); -} +Status PlasmaClient::Seal(const ObjectID &object_id) { return impl_->Seal(object_id); } Status PlasmaClient::Delete(const ObjectID &object_id) { return impl_->Delete(std::vector{object_id}); diff --git a/src/ray/object_manager/plasma/client.h b/src/ray/object_manager/plasma/client.h index 74841df373fee..00c85cca3f11e 100644 --- a/src/ray/object_manager/plasma/client.h +++ b/src/ray/object_manager/plasma/client.h @@ -90,7 +90,7 @@ class PlasmaClientInterface { /// /// \param object_id The ID of the object to seal. /// \return The return status. - virtual Status Seal(const ObjectID &object_id, int64_t num_readers = -1) = 0; + virtual Status Seal(const ObjectID &object_id) = 0; /// Abort an unsealed object in the object store. If the abort succeeds, then /// it will be as if the object was never created at all. The unsealed object @@ -129,6 +129,7 @@ class PlasmaClientInterface { /// be either sealed or aborted. virtual Status CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, + bool is_mutable, int64_t data_size, const uint8_t *metadata, int64_t metadata_size, @@ -136,6 +137,13 @@ class PlasmaClientInterface { plasma::flatbuf::ObjectSource source, int device_num = 0) = 0; + virtual Status WriteAcquireMutableObject(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data) = 0; + /// Delete a list of objects from the object store. This currently assumes that the /// object is present, has been sealed and not used by another client. Otherwise, /// it is a no operation. @@ -195,6 +203,7 @@ class PlasmaClient : public PlasmaClientInterface { /// be either sealed or aborted. Status CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, + bool is_mutable, int64_t data_size, const uint8_t *metadata, int64_t metadata_size, @@ -202,6 +211,13 @@ class PlasmaClient : public PlasmaClientInterface { plasma::flatbuf::ObjectSource source, int device_num = 0); + Status WriteAcquireMutableObject(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data); + /// Create an object in the Plasma Store. 
Any metadata for this object must be /// be passed in when the object is created. /// @@ -294,7 +310,7 @@ class PlasmaClient : public PlasmaClientInterface { /// /// \param object_id The ID of the object to seal. /// \return The return status. - Status Seal(const ObjectID &object_id, int64_t num_readers = -1); + Status Seal(const ObjectID &object_id); /// Delete an object from the object store. This currently assumes that the /// object is present, has been sealed and not used by another client. Otherwise, diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index 260951f178567..7d60c3ff1394a 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -70,6 +70,13 @@ const LocalObject *ObjectStore::SealObject(const ObjectID &object_id) { } entry->state = ObjectState::PLASMA_SEALED; entry->construct_duration = std::time(nullptr) - entry->create_time; + auto plasma_header = entry->GetPlasmaObjectHeader(); + if (entry->object_info.is_mutable) { + // Register the sealed object before allowing the writer to write. + plasma_header->ReadRelease(/*read_version=*/1); + } else { + RAY_CHECK(plasma_header->num_readers == -1) << plasma_header->num_readers; + } return entry; } diff --git a/src/ray/object_manager/plasma/plasma.fbs b/src/ray/object_manager/plasma/plasma.fbs index 68177465f3366..ba2df089c6032 100644 --- a/src/ray/object_manager/plasma/plasma.fbs +++ b/src/ray/object_manager/plasma/plasma.fbs @@ -129,6 +129,8 @@ table PlasmaCreateRequest { owner_port: int; // Unique id for the owner worker. owner_worker_id: string; + // Whether the object will be mutable. + is_mutable: bool; // The size of the object's data in bytes. data_size: ulong; // The size of the object's metadata in bytes. 
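Note: taken together, the plasma client and store changes above implement a single-writer, multi-reader handshake on the shared object header: the writer publishes a new version with WriteAcquire (released on Seal via WriteRelease) together with the expected number of readers, and each reader blocks in ReadAcquire until that version is visible, then calls ReadRelease so the buffer can be rewritten. A minimal sketch of the intended end-to-end flow (illustrative only; it uses the experimental Python wrappers that later commits in this series add, and those names may change):

    import ray

    ray.init()

    # Writer: allocate a reusable mutable plasma object once (Create with
    # is_mutable=true), then publish values into the same buffer.
    ref = ray._create_mutable_object(1000)
    ray._put_mutable_object(b"value", ref, num_readers=1)  # WriteAcquire; WriteRelease happens on Seal

    # Reader: ray.get blocks in ReadAcquire until the new version is published;
    # the explicit release calls ReadRelease so the writer can write again.
    assert ray.get(ref) == b"value"
    ray._release_mutable_object(ref)
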
diff --git a/src/ray/object_manager/plasma/protocol.cc b/src/ray/object_manager/plasma/protocol.cc index 79b9a27827fb1..50f1f60d332ec 100644 --- a/src/ray/object_manager/plasma/protocol.cc +++ b/src/ray/object_manager/plasma/protocol.cc @@ -200,6 +200,7 @@ Status SendCreateRetryRequest(const std::shared_ptr &store_conn, Status SendCreateRequest(const std::shared_ptr &store_conn, ObjectID object_id, const ray::rpc::Address &owner_address, + bool is_mutable, int64_t data_size, int64_t metadata_size, flatbuf::ObjectSource source, @@ -213,6 +214,7 @@ Status SendCreateRequest(const std::shared_ptr &store_conn, fbb.CreateString(owner_address.ip_address()), owner_address.port(), fbb.CreateString(owner_address.worker_id()), + is_mutable, data_size, metadata_size, source, @@ -229,6 +231,7 @@ void ReadCreateRequest(uint8_t *data, RAY_DCHECK(data); auto message = flatbuffers::GetRoot(data); RAY_DCHECK(VerifyFlatbuffer(message, data, size)); + object_info->is_mutable = message->is_mutable(); object_info->data_size = message->data_size(); object_info->metadata_size = message->metadata_size(); object_info->object_id = ObjectID::FromBinary(message->object_id()->str()); diff --git a/src/ray/object_manager/plasma/protocol.h b/src/ray/object_manager/plasma/protocol.h index 7f4fcdd3ac589..23a120ac2ca05 100644 --- a/src/ray/object_manager/plasma/protocol.h +++ b/src/ray/object_manager/plasma/protocol.h @@ -85,6 +85,7 @@ Status SendCreateRetryRequest(const std::shared_ptr &store_conn, Status SendCreateRequest(const std::shared_ptr &store_conn, ObjectID object_id, const ray::rpc::Address &owner_address, + bool is_mutable, int64_t data_size, int64_t metadata_size, flatbuf::ObjectSource source, diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index e948e885aecfa..c0aa1319b193f 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -540,50 +540,32 @@ void PlasmaStore::WaitForSeal(const ObjectID &object_id, RAY_CHECK(entry); auto plasma_header = entry->GetPlasmaObjectHeader(); - int event_fd = eventfd(0, EFD_CLOEXEC); - RAY_CHECK(event_fd != -1); + auto seal_signal = std::make_shared(io_context_); + seal_signal->expires_at(boost::posix_time::pos_infin); - auto wait_fn = [event_fd, plasma_header]() { + auto wait_fn = [this, seal_signal, plasma_header]() { plasma_header->ReadAcquire(/*read_version=*/1); - uint64_t data = 1; - auto num_bytes_written = write(event_fd, &data, sizeof(data)); - // TODO(swang): Need proper error checking here. 
- if (num_bytes_written != sizeof(data)) { - RAY_LOG(WARNING) << num_bytes_written << " bytes written on fd " << event_fd - << " err: " << strerror(errno); + { + absl::MutexLock lock(&seal_deadline_timer_mutex_); + seal_signal->cancel(); } }; auto wait_thread = std::make_shared(wait_fn); - boost::asio::spawn( - io_context_, - [this, event_fd, object_id, plasma_header, wait_thread, client]( - boost::asio::yield_context yield) { - auto event_stream = std::make_shared( - io_context_, event_fd); - auto data = std::make_shared(0); - auto buf = boost::asio::buffer(data.get(), sizeof(*data)); - boost::asio::async_read( - *event_stream, - buf, - [this, event_stream, data, object_id, event_fd, wait_thread]( - const boost::system::error_code &ec, size_t bytes_transferred) { - RAY_CHECK(bytes_transferred == sizeof(*data)) << ec.message(); - - // RAY_CHECK(plasma_header->num_readers == -1) << - // plasma_header->num_readers; - - { - absl::MutexLock lock(&mutex_); - SealObjects({object_id}); - } - - wait_thread->join(); - close(event_fd); - }); - }); + { + absl::MutexLock lock(&seal_deadline_timer_mutex_); + seal_signal->async_wait([this, object_id, plasma_header, wait_thread, client]( + const boost::system::error_code &ec) { + { + absl::MutexLock lock(&mutex_); + SealObjects({object_id}); + } + + wait_thread->join(); + }); + } } int64_t PlasmaStore::GetConsumedBytes() { return total_consumed_bytes_; } diff --git a/src/ray/object_manager/plasma/store.h b/src/ray/object_manager/plasma/store.h index 74c33edb9a2c6..3a4b457a6381b 100644 --- a/src/ray/object_manager/plasma/store.h +++ b/src/ray/object_manager/plasma/store.h @@ -309,6 +309,8 @@ class PlasmaStore { bool dumped_on_oom_ ABSL_GUARDED_BY(mutex_) = false; GetRequestQueue get_request_queue_ ABSL_GUARDED_BY(mutex_); + + absl::Mutex seal_deadline_timer_mutex_; }; } // namespace plasma From c2dbf1f8cf8a9e0c444a06af1817f956aaf9a9b5 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 29 Nov 2023 17:09:57 -0800 Subject: [PATCH 07/66] feature flag for shared mem seal, only acquire once per ray.get Signed-off-by: Stephanie Wang --- src/ray/common/ray_config_def.h | 4 + src/ray/object_manager/plasma/client.cc | 110 +++++++++++++++--------- src/ray/object_manager/plasma/store.cc | 7 +- 3 files changed, 78 insertions(+), 43 deletions(-) diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index d15d9fbffc7dd..ba6825b066954 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -886,3 +886,7 @@ RAY_CONFIG(bool, enable_autoscaler_v2, false) // Python GCS client number of reconnection retry and timeout. RAY_CONFIG(int64_t, nums_py_gcs_reconnect_retry, 5) RAY_CONFIG(int64_t, py_gcs_connect_timeout_s, 30) + +// Feature flag for whether to use shared-memory based synchronization to +// implement plasma object Seal. The current method is to instead use IPC. +RAY_CONFIG(bool, plasma_use_shared_memory_seal, false) diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 57c62bf486df9..95f5f8489c5c4 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -97,9 +97,17 @@ struct ObjectInUseEntry { bool is_sealed; bool is_mutable = false; /// For shared objects only. - /// The last version that we read or wrote. To read or write again, we must - /// pass a newer version than this. + /// The last version that we read. To read again, we must pass a newer + /// version than this. 
int64_t next_version_to_read = 1; + /// Whether we currently have a read lock on the object. If this is true, + /// then it is safe to read the value of the object. For immutable objects, + /// this will always be true once the object has been sealed. For mutable + /// objects, ReadRelease resets this to false, and ReadAcquire resets to + /// true. + bool read_acquired = false; + /// The last version that we wrote. To write again, we must pass a newer + /// version than this. int64_t next_version_to_write = 1; }; @@ -160,6 +168,9 @@ class PlasmaClient::Impl : public std::enable_shared_from_this &object_entry); + Status GetRelease(const ObjectID &object_id); Status Release(const ObjectID &object_id); @@ -562,18 +573,10 @@ Status PlasmaClient::Impl::GetBuffers( << "Attempting to get an object that this client created but hasn't sealed."; all_present = false; } else { - PlasmaObject *object = &object_entry->second->object; - // Wait for the object to become ready to read. - auto plasma_header = GetPlasmaObjectHeader(*object); - int64_t version_read = - plasma_header->ReadAcquire(object_entry->second->next_version_to_read); - auto data_size = plasma_header->GetDataSize(); - RAY_LOG(DEBUG) << "SANG-TODO data size is " << data_size; - if (version_read > 0) { - object_entry->second->is_mutable = true; - object_entry->second->next_version_to_read = version_read; - } + auto plasma_header = EnsureGetAcquired(object_entry->second); + + PlasmaObject *object = &object_entry->second->object; std::shared_ptr physical_buf; if (object->device_num == 0) { @@ -584,6 +587,7 @@ Status PlasmaClient::Impl::GetBuffers( RAY_LOG(FATAL) << "GPU library is not enabled."; } physical_buf = wrap_buffer(object_ids[i], physical_buf); + auto data_size = plasma_header->GetDataSize(); object_buffers[i].data = SharedMemoryBuffer::Slice(physical_buf, 0, data_size); object_buffers[i].metadata = SharedMemoryBuffer::Slice( physical_buf, object->data_size, object->metadata_size); @@ -645,15 +649,11 @@ Status PlasmaClient::Impl::GetBuffers( // client is using. Cache the reference to the object. IncrementObjectCount(received_object_ids[i], object, true); auto &object_entry = objects_in_use_[received_object_ids[i]]; + // Wait for the object to become ready to read. 
- auto plasma_header = GetPlasmaObjectHeader(*object); - int64_t version_read = plasma_header->ReadAcquire(/*version=*/1); + RAY_CHECK(!object_entry->read_acquired); + auto plasma_header = EnsureGetAcquired(object_entry); auto data_size = plasma_header->GetDataSize(); - if (version_read > 0) { - object_entry->is_mutable = true; - object_entry->next_version_to_read = version_read; - } - std::shared_ptr physical_buf; if (object->device_num == 0) { uint8_t *data = LookupMmappedFile(object->store_fd); @@ -694,9 +694,27 @@ Status PlasmaClient::Impl::Get(const std::vector &object_ids, &object_ids[0], num_objects, timeout_ms, wrap_buffer, &(*out)[0], is_from_worker); } +ray::PlasmaObjectHeader *PlasmaClient::Impl::EnsureGetAcquired( + std::unique_ptr &object_entry) { + PlasmaObject *object = &object_entry->object; + auto plasma_header = GetPlasmaObjectHeader(*object); + if (object_entry->read_acquired) { + return plasma_header; + } + + int64_t version_read = plasma_header->ReadAcquire(object_entry->next_version_to_read); + object_entry->read_acquired = true; + if (version_read > 0) { + object_entry->is_mutable = true; + object_entry->next_version_to_read = version_read; + } + return plasma_header; +} + Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { RAY_LOG(DEBUG) << "Try to release Get for object " << object_id; std::unique_lock guard(client_mutex_); + auto object_entry = objects_in_use_.find(object_id); if (object_entry == objects_in_use_.end()) { return Status::ObjectNotFound( @@ -712,8 +730,8 @@ Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { "ray.release() called on an object that is not mutable"); } + auto plasma_header = EnsureGetAcquired(entry); RAY_LOG(DEBUG) << "Release shared object " << object_id; - auto plasma_header = GetPlasmaObjectHeader(entry->object); plasma_header->ReadRelease(entry->next_version_to_read); // The next read needs to read at least this version. entry->next_version_to_read++; @@ -741,11 +759,14 @@ Status PlasmaClient::Impl::Release(const ObjectID &object_id) { const auto object_entry = objects_in_use_.find(object_id); RAY_CHECK(object_entry != objects_in_use_.end()); - object_entry->second->count -= 1; - RAY_CHECK(object_entry->second->count >= 0); - // Check if the client is no longer using this object. - // TODO(swang): Nicer way to pin shared objects. - if (object_entry->second->count == 0 && !object_entry->second->is_mutable) { + if (!object_entry->second->is_mutable) { + // Release only applies to immutable objects. + // TODO(swang): Add a delete call to properly clean up mutable objects. + object_entry->second->count -= 1; + RAY_CHECK(object_entry->second->count >= 0); + } + + if (object_entry->second->count == 0) { // object_entry is invalidated in MarkObjectUnused, need to read the fd beforehand. MEMFD_TYPE fd = object_entry->second->object.store_fd; // Tell the store that the client no longer needs the object. @@ -821,22 +842,27 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { // The next Write must pass a higher version. object_entry->second->next_version_to_write++; object_entry->second->is_sealed = true; - //// Send the seal request to Plasma. 
- // RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); - // std::vector buffer; - // RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); - // ObjectID sealed_id; - // RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); - // RAY_CHECK(sealed_id == object_id); - //// We call PlasmaClient::Release to decrement the number of instances of this - //// object - //// that are currently being used by this client. The corresponding increment - //// happened in plasma_create and was used to ensure that the object was not - //// released before the call to PlasmaClient::Seal. - // return Release(object_id); - - // TODO(swang): Release the object if the ref count == 0. - return Status::OK(); + + if (RayConfig::instance().plasma_use_shared_memory_seal()) { + // If using shared-memory based Seal, then we don't need to do anything + // further because the object store will learn that the object has been + // sealed when ReadAcquire returns. + return Status::OK(); + } + + /// Send the seal request to Plasma. + RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); + std::vector buffer; + RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); + ObjectID sealed_id; + RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); + RAY_CHECK(sealed_id == object_id); + // We call PlasmaClient::Release to decrement the number of instances of this + // object + // that are currently being used by this client. The corresponding increment + // happened in plasma_create and was used to ensure that the object was not + // released before the call to PlasmaClient::Seal. + return Release(object_id); } Status PlasmaClient::Impl::Abort(const ObjectID &object_id) { diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index c0aa1319b193f..c6b80160511c8 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -528,7 +528,9 @@ void PlasmaStore::ReplyToCreateClient(const std::shared_ptr &client, static_cast(client->SendFd(result.store_fd)); } - WaitForSeal(object_id, client); + if (RayConfig::instance().plasma_use_shared_memory_seal()) { + WaitForSeal(object_id, client); + } } else { static_cast(SendUnfinishedCreateReply(client, object_id, req_id)); } @@ -540,6 +542,9 @@ void PlasmaStore::WaitForSeal(const ObjectID &object_id, RAY_CHECK(entry); auto plasma_header = entry->GetPlasmaObjectHeader(); + // Read acquire is blocking, so put it on a background thread and use an + // async timer as a signal. The main thread is signaled when the timer is + // cancelled. 
auto seal_signal = std::make_shared(io_context_); seal_signal->expires_at(boost::posix_time::pos_infin); From 6d4aa943ebd8b77429a4e54857e4012ef40ab2c3 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 29 Nov 2023 17:35:00 -0800 Subject: [PATCH 08/66] put-get Signed-off-by: Stephanie Wang --- python/ray/__init__.py | 2 +- python/ray/_private/worker.py | 5 ++++- python/ray/tests/test_accelerated_dag.py | 16 ++++++++++++---- src/ray/object_manager/plasma/client.cc | 1 + 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/python/ray/__init__.py b/python/ray/__init__.py index d95d2f5a20c5b..fd32de31e5463 100644 --- a/python/ray/__init__.py +++ b/python/ray/__init__.py @@ -116,11 +116,11 @@ def _configure_system(): SPILL_WORKER_MODE, _create_mutable_object, _put_mutable_object, + _release_mutable_object, cancel, get, get_actor, get_gpu_ids, - release, init, is_initialized, put, diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index 4e5b08f6dedb1..31c8a9fae2276 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -2498,7 +2498,7 @@ def show_in_dashboard(message: str, key: str = "", dtype: str = "text"): blocking_get_inside_async_warned = False -def release(object_ref): +def _release_mutable_object(object_ref): worker = global_worker worker.check_connected() worker.core_worker.get_release([object_ref]) @@ -2640,6 +2640,9 @@ def _put_mutable_object(value: Any, object_ref: ObjectRef, num_readers: int): worker = global_worker worker.check_connected() + if num_readers <= 0: + raise ValueError("``num_readers`` must be a positive integer.") + try: serialized_value = worker.get_serialization_context().serialize(value) except TypeError as e: diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index 405d78d46cc9b..4622378349acc 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -11,11 +11,19 @@ logger = logging.getLogger(__name__) -def test_put_mutable_object(ray_start_cluster): - ray.init() +def test_put_get(ray_start_cluster): + ray.init( + _system_config={ + "plasma_use_shared_memory_seal": True, + } + ) ref = ray._create_mutable_object(1000) - ray._put_mutable_object(b"hello", ref, num_readers=1) - assert ray.get(ref) == b"hello" + + for i in range(100): + val = i.to_bytes(8, "little") + ray._put_mutable_object(val, ref, num_readers=1) + assert ray.get(ref) == val + ray._release_mutable_object(ref) if __name__ == "__main__": diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 95f5f8489c5c4..f8148f01eaf84 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -735,6 +735,7 @@ Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { plasma_header->ReadRelease(entry->next_version_to_read); // The next read needs to read at least this version. 
entry->next_version_to_read++; + entry->read_acquired = false; return Status::OK(); } From bc4f1e9b98b7a8d757725836dfa4b3bba81d5562 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 29 Nov 2023 17:44:55 -0800 Subject: [PATCH 09/66] rm shared mem seal Signed-off-by: Stephanie Wang --- python/ray/tests/test_accelerated_dag.py | 7 +--- src/ray/common/ray_config_def.h | 4 -- src/ray/object_manager/common.cc | 1 - src/ray/object_manager/plasma/client.cc | 12 +----- src/ray/object_manager/plasma/object_store.cc | 5 +-- src/ray/object_manager/plasma/store.cc | 41 ------------------- src/ray/object_manager/plasma/store.h | 4 -- 7 files changed, 5 insertions(+), 69 deletions(-) diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index 4622378349acc..ebc915255c7e7 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -12,11 +12,8 @@ def test_put_get(ray_start_cluster): - ray.init( - _system_config={ - "plasma_use_shared_memory_seal": True, - } - ) + ray.init() + ref = ray._create_mutable_object(1000) for i in range(100): diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index ba6825b066954..d15d9fbffc7dd 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -886,7 +886,3 @@ RAY_CONFIG(bool, enable_autoscaler_v2, false) // Python GCS client number of reconnection retry and timeout. RAY_CONFIG(int64_t, nums_py_gcs_reconnect_retry, 5) RAY_CONFIG(int64_t, py_gcs_connect_timeout_s, 30) - -// Feature flag for whether to use shared-memory based synchronization to -// implement plasma object Seal. The current method is to instead use IPC. -RAY_CONFIG(bool, plasma_use_shared_memory_seal, false) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index 24e906d9ba4d5..34194f249d234 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -78,7 +78,6 @@ void PlasmaObjectHeader::WriteRelease(int64_t write_version) { << version << ". Are you sure this is the only writer?"; version = write_version; - RAY_CHECK(num_readers != 0); num_read_acquires_remaining = num_readers; num_read_releases_remaining = num_readers; diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index f8148f01eaf84..46f6d4ac286f2 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -501,9 +501,8 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, plasma_header->WriteAcquire(/*next_version_to_write*/ entry->next_version_to_write, entry->object.data_size); if (entry->is_mutable) { - // The plasma store is the first reader. Once it read-releases, the - // writer may write an actual value. - plasma_header->num_readers = 1; + // When the object is first created, it is in writeable state. + plasma_header->num_readers = 0; } else { // Anyone may read. plasma_header->num_readers = -1; @@ -844,13 +843,6 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { object_entry->second->next_version_to_write++; object_entry->second->is_sealed = true; - if (RayConfig::instance().plasma_use_shared_memory_seal()) { - // If using shared-memory based Seal, then we don't need to do anything - // further because the object store will learn that the object has been - // sealed when ReadAcquire returns. - return Status::OK(); - } - /// Send the seal request to Plasma. 
RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); std::vector buffer; diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index 7d60c3ff1394a..8f4178dc9b797 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -71,10 +71,7 @@ const LocalObject *ObjectStore::SealObject(const ObjectID &object_id) { entry->state = ObjectState::PLASMA_SEALED; entry->construct_duration = std::time(nullptr) - entry->create_time; auto plasma_header = entry->GetPlasmaObjectHeader(); - if (entry->object_info.is_mutable) { - // Register the sealed object before allowing the writer to write. - plasma_header->ReadRelease(/*read_version=*/1); - } else { + if (!entry->object_info.is_mutable) { RAY_CHECK(plasma_header->num_readers == -1) << plasma_header->num_readers; } return entry; diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index c6b80160511c8..2f96cf139d5b9 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -527,52 +527,11 @@ void PlasmaStore::ReplyToCreateClient(const std::shared_ptr &client, error == PlasmaError::OK && result.device_num == 0) { static_cast(client->SendFd(result.store_fd)); } - - if (RayConfig::instance().plasma_use_shared_memory_seal()) { - WaitForSeal(object_id, client); - } } else { static_cast(SendUnfinishedCreateReply(client, object_id, req_id)); } } -void PlasmaStore::WaitForSeal(const ObjectID &object_id, - const std::shared_ptr &client) { - auto entry = object_lifecycle_mgr_.GetObject(object_id); - RAY_CHECK(entry); - auto plasma_header = entry->GetPlasmaObjectHeader(); - - // Read acquire is blocking, so put it on a background thread and use an - // async timer as a signal. The main thread is signaled when the timer is - // cancelled. - auto seal_signal = std::make_shared(io_context_); - seal_signal->expires_at(boost::posix_time::pos_infin); - - auto wait_fn = [this, seal_signal, plasma_header]() { - plasma_header->ReadAcquire(/*read_version=*/1); - - { - absl::MutexLock lock(&seal_deadline_timer_mutex_); - seal_signal->cancel(); - } - }; - - auto wait_thread = std::make_shared(wait_fn); - - { - absl::MutexLock lock(&seal_deadline_timer_mutex_); - seal_signal->async_wait([this, object_id, plasma_header, wait_thread, client]( - const boost::system::error_code &ec) { - { - absl::MutexLock lock(&mutex_); - SealObjects({object_id}); - } - - wait_thread->join(); - }); - } -} - int64_t PlasmaStore::GetConsumedBytes() { return total_consumed_bytes_; } bool PlasmaStore::IsObjectSpillable(const ObjectID &object_id) { diff --git a/src/ray/object_manager/plasma/store.h b/src/ray/object_manager/plasma/store.h index 3a4b457a6381b..a6c992c131280 100644 --- a/src/ray/object_manager/plasma/store.h +++ b/src/ray/object_manager/plasma/store.h @@ -118,8 +118,6 @@ class PlasmaStore { return available; } - void WaitForSeal(const ObjectID &object_id, const std::shared_ptr &client); - private: /// Create a new object. The client must do a call to release_object to tell /// the store when it is done with the object. 
@@ -309,8 +307,6 @@ class PlasmaStore { bool dumped_on_oom_ ABSL_GUARDED_BY(mutex_) = false; GetRequestQueue get_request_queue_ ABSL_GUARDED_BY(mutex_); - - absl::Mutex seal_deadline_timer_mutex_; }; } // namespace plasma From c4a2378baf57535a988f4b389155b5266d9f3fba Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 30 Nov 2023 08:47:15 -0800 Subject: [PATCH 10/66] fix num_readers on first version, unit tests pass now Signed-off-by: Stephanie Wang --- python/ray/_private/worker.py | 6 ++- python/ray/tests/test_accelerated_dag.py | 32 +++++++++++-- src/ray/object_manager/common.cc | 1 + src/ray/object_manager/plasma/client.cc | 59 ++++++++++++------------ src/ray/object_manager/plasma/store.cc | 2 - 5 files changed, 63 insertions(+), 37 deletions(-) diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index 31c8a9fae2276..d9ed7a72e94c5 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -2498,10 +2498,12 @@ def show_in_dashboard(message: str, key: str = "", dtype: str = "text"): blocking_get_inside_async_warned = False -def _release_mutable_object(object_ref): +def _release_mutable_object(object_refs): worker = global_worker worker.check_connected() - worker.core_worker.get_release([object_ref]) + if isinstance(object_refs, ObjectRef): + object_refs = [object_refs] + worker.core_worker.get_release(object_refs) @overload diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index ebc915255c7e7..4adc04bb1e51b 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -11,18 +11,42 @@ logger = logging.getLogger(__name__) -def test_put_get(ray_start_cluster): - ray.init() - +def test_put_local_get(ray_start_regular): ref = ray._create_mutable_object(1000) - for i in range(100): + num_writes = 1000 + for i in range(num_writes): val = i.to_bytes(8, "little") ray._put_mutable_object(val, ref, num_readers=1) assert ray.get(ref) == val ray._release_mutable_object(ref) +@pytest.mark.parametrize("num_readers", [1, 4]) +def test_put_remote_get(ray_start_regular, num_readers): + ref = ray._create_mutable_object(1000) + + @ray.remote(num_cpus=0) + class Reader: + def __init__(self): + pass + + def read(self, ref, num_writes): + for i in range(num_writes): + val = i.to_bytes(8, "little") + assert ray.get(ref[0]) == val + ray._release_mutable_object(ref) + + num_writes = 1000 + readers = [Reader.remote() for _ in range(num_readers)] + done = [reader.read.remote([ref], num_writes) for reader in readers] + for i in range(num_writes): + val = i.to_bytes(8, "little") + ray._put_mutable_object(val, ref, num_readers=num_readers) + + ray.get(done) + + if __name__ == "__main__": if os.environ.get("PARALLEL_CI"): sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index 34194f249d234..bd1cc168481c8 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -78,6 +78,7 @@ void PlasmaObjectHeader::WriteRelease(int64_t write_version) { << version << ". 
Are you sure this is the only writer?"; version = write_version; + RAY_CHECK(num_readers != 0) << num_readers; num_read_acquires_remaining = num_readers; num_read_releases_remaining = num_readers; diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 46f6d4ac286f2..b25d9d28853b5 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -493,17 +493,13 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, entry->is_mutable = is_mutable; auto plasma_header = GetPlasmaObjectHeader(entry->object); - // The corresponding WriteRelease takes place in Seal. - // When an object is first created, the data size is equivalent to - // buffer size. - // The first creation's version is always 1. - RAY_CHECK(entry->next_version_to_write == 1); - plasma_header->WriteAcquire(/*next_version_to_write*/ entry->next_version_to_write, - entry->object.data_size); - if (entry->is_mutable) { - // When the object is first created, it is in writeable state. - plasma_header->num_readers = 0; - } else { + if (!entry->is_mutable) { + // The first creation's version is always 1. + RAY_CHECK(entry->next_version_to_write == 1); + // The corresponding WriteRelease takes place in Seal. + // When an object is first created, the data size is equivalent to + // buffer size. + plasma_header->WriteAcquire(entry->next_version_to_write, data_size); // Anyone may read. plasma_header->num_readers = -1; } @@ -836,26 +832,31 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { return Status::ObjectAlreadySealed("Seal() called on an already sealed object"); } - auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); - plasma_header->WriteRelease( - /*write_version=*/object_entry->second->next_version_to_write); - // The next Write must pass a higher version. - object_entry->second->next_version_to_write++; object_entry->second->is_sealed = true; + auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); + if (plasma_header->num_readers != 0) { + plasma_header->WriteRelease( + /*write_version=*/object_entry->second->next_version_to_write); + // The next Write must pass a higher version. + object_entry->second->next_version_to_write++; + } else { + // Send the seal request to Plasma. This is the normal Seal path, used for + // immutable objects and the initial Create call for mutable objects. + RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); + std::vector buffer; + RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); + ObjectID sealed_id; + RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); + RAY_CHECK(sealed_id == object_id); + // We call PlasmaClient::Release to decrement the number of instances of this + // object + // that are currently being used by this client. The corresponding increment + // happened in plasma_create and was used to ensure that the object was not + // released before the call to PlasmaClient::Seal. + RAY_RETURN_NOT_OK(Release(object_id)); + } - /// Send the seal request to Plasma. 
- RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); - std::vector buffer; - RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); - ObjectID sealed_id; - RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); - RAY_CHECK(sealed_id == object_id); - // We call PlasmaClient::Release to decrement the number of instances of this - // object - // that are currently being used by this client. The corresponding increment - // happened in plasma_create and was used to ensure that the object was not - // released before the call to PlasmaClient::Seal. - return Release(object_id); + return Status::OK(); } Status PlasmaClient::Impl::Abort(const ObjectID &object_id) { diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index 2f96cf139d5b9..66876de42cbcc 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -31,9 +31,7 @@ #include #include #include -#include -#include #include #include #include From e40d3c837b790c634baa4f416a6b42c228d3eb06 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 30 Nov 2023 13:26:48 -0800 Subject: [PATCH 11/66] mutable object -> channel Signed-off-by: Stephanie Wang --- python/ray/__init__.py | 6 +++--- python/ray/_private/worker.py | 12 +++++++++--- python/ray/tests/test_accelerated_dag.py | 12 ++++++------ 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/python/ray/__init__.py b/python/ray/__init__.py index fd32de31e5463..95ac20ceb15fb 100644 --- a/python/ray/__init__.py +++ b/python/ray/__init__.py @@ -114,9 +114,9 @@ def _configure_system(): WORKER_MODE, RESTORE_WORKER_MODE, SPILL_WORKER_MODE, - _create_mutable_object, - _put_mutable_object, - _release_mutable_object, + _create_channel, + _write_channel, + _end_read_channel, cancel, get, get_actor, diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index d9ed7a72e94c5..fa93f8ea91851 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -2498,7 +2498,13 @@ def show_in_dashboard(message: str, key: str = "", dtype: str = "text"): blocking_get_inside_async_warned = False -def _release_mutable_object(object_refs): +def _end_read_channel(object_refs): + """ + Signal to the writer that the channel is ready to write again. The read + begins when the caller calls ray.get and a written value is available. If + ray.get is not called first, then this call will block until a value is + written, then drop the value. 
+ """ worker = global_worker worker.check_connected() if isinstance(object_refs, ObjectRef): @@ -2638,7 +2644,7 @@ def get( @PublicAPI -def _put_mutable_object(value: Any, object_ref: ObjectRef, num_readers: int): +def _write_channel(value: Any, object_ref: ObjectRef, num_readers: int): worker = global_worker worker.check_connected() @@ -2663,7 +2669,7 @@ def _put_mutable_object(value: Any, object_ref: ObjectRef, num_readers: int): @PublicAPI -def _create_mutable_object( +def _create_channel( buffer_size: int, ) -> "ray.ObjectRef": worker = global_worker diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index 4adc04bb1e51b..c836bfd856450 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -12,19 +12,19 @@ def test_put_local_get(ray_start_regular): - ref = ray._create_mutable_object(1000) + ref = ray._create_channel(1000) num_writes = 1000 for i in range(num_writes): val = i.to_bytes(8, "little") - ray._put_mutable_object(val, ref, num_readers=1) + ray._write_channel(val, ref, num_readers=1) assert ray.get(ref) == val - ray._release_mutable_object(ref) + ray._end_read_channel(ref) @pytest.mark.parametrize("num_readers", [1, 4]) def test_put_remote_get(ray_start_regular, num_readers): - ref = ray._create_mutable_object(1000) + ref = ray._create_channel(1000) @ray.remote(num_cpus=0) class Reader: @@ -35,14 +35,14 @@ def read(self, ref, num_writes): for i in range(num_writes): val = i.to_bytes(8, "little") assert ray.get(ref[0]) == val - ray._release_mutable_object(ref) + ray._end_read_channel(ref) num_writes = 1000 readers = [Reader.remote() for _ in range(num_readers)] done = [reader.read.remote([ref], num_writes) for reader in readers] for i in range(num_writes): val = i.to_bytes(8, "little") - ray._put_mutable_object(val, ref, num_readers=num_readers) + ray._write_channel(val, ref, num_readers=num_readers) ray.get(done) From b79b7d1ea9c77166e63022a6abe033635b5b8abb Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 30 Nov 2023 15:25:11 -0800 Subject: [PATCH 12/66] micro Signed-off-by: Stephanie Wang --- python/ray/_private/ray_perf.py | 83 ++++++++++++++++++++++++++++++++ src/ray/object_manager/common.cc | 2 +- 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py index 316f3baeca846..1d07c49aea89f 100644 --- a/python/ray/_private/ray_perf.py +++ b/python/ray/_private/ray_perf.py @@ -288,6 +288,89 @@ def async_actor_multi(): results += timeit("n:n async-actor calls async", async_actor_multi, m * n) ray.shutdown() + ################################################# + # Perf tests for channels, used in compiled DAGs. 
+ ################################################# + + ray.init() + + def put_channel_small(chans, num_readers=1, do_get=False, do_release=False): + for chan in chans: + ray._write_channel(b"0", chan, num_readers=num_readers) + if do_get: + ray.get(chan) + if do_release: + ray._end_read_channel(chan) + + @ray.remote + class ChannelReader: + def ready(self): + return + + def read(self, chans): + while True: + for chan in chans: + ray.get(chan) + ray._end_read_channel(chan) + + chans = [ray._create_channel(1000)] + results += timeit( + "local put, single channel calls", + lambda: put_channel_small(chans, do_release=True), + ) + results += timeit( + "local put:local get, single channel calls", + lambda: put_channel_small(chans, do_get=True, do_release=True), + ) + + chans = [ray._create_channel(1000)] + reader = ChannelReader.remote() + ray.get(reader.ready.remote()) + reader.read.remote(chans) + results += timeit( + "local put:1 remote get, single channel calls", lambda: put_channel_small(chans) + ) + ray.kill(reader) + + n_cpu = multiprocessing.cpu_count() // 2 + print(f"Testing multiple readers/channels, n={n_cpu}") + + chans = [ray._create_channel(1000)] + readers = [ChannelReader.remote() for _ in range(n_cpu)] + ray.get([reader.ready.remote() for reader in readers]) + for reader in readers: + reader.read.remote(chans) + results += timeit( + "local put:n remote get, single channel calls", + lambda: put_channel_small(chans, num_readers=n_cpu), + ) + for reader in readers: + ray.kill(reader) + + chans = [ray._create_channel(1000) for _ in range(n_cpu)] + reader = ChannelReader.remote() + ray.get(reader.ready.remote()) + reader.read.remote(chans) + results += timeit( + "local put:1 remote get, n channels calls", lambda: put_channel_small(chans) + ) + ray.kill(reader) + + chans = [ray._create_channel(1000) for _ in range(n_cpu)] + readers = [ChannelReader.remote() for _ in range(n_cpu)] + ray.get([reader.ready.remote() for reader in readers]) + for chan, reader in zip(chans, readers): + reader.read.remote([chan]) + results += timeit( + "local put:n remote get, n channels calls", lambda: put_channel_small(chans) + ) + for reader in readers: + ray.kill(reader) + + ############################ + # End of channel perf tests. + ############################ + NUM_PGS = 100 NUM_BUNDLES = 1 ray.init(resources={"custom": 100}) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index bd1cc168481c8..970fb9d096370 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -40,7 +40,7 @@ void PlasmaObjectHeader::Destroy() { // This has to be called only when reader lock is acquired // via ReadAcquire. 
uint64_t PlasmaObjectHeader::GetDataSize() const { - RAY_CHECK_GE(num_read_releases_remaining, 0) + RAY_CHECK_NE(num_read_releases_remaining, 0) << "ReadAcquire has to be called before calling this method."; return data_size; } From 5ea0fe3fd887303dfd831ec8d46e29e1d0ad671c Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 30 Nov 2023 17:07:50 -0800 Subject: [PATCH 13/66] support different metadata Signed-off-by: Stephanie Wang --- python/ray/tests/test_accelerated_dag.py | 18 ++++++ src/ray/object_manager/common.cc | 17 ++++-- src/ray/object_manager/common.h | 9 ++- src/ray/object_manager/plasma/client.cc | 68 +++++++++++++---------- src/ray/object_manager/plasma/common.h | 4 ++ src/ray/object_manager/plasma/plasma.fbs | 4 ++ src/ray/object_manager/plasma/plasma.h | 3 + src/ray/object_manager/plasma/protocol.cc | 4 ++ 8 files changed, 92 insertions(+), 35 deletions(-) diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index c836bfd856450..e8ea4e7eb4a13 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -3,6 +3,7 @@ import os import sys +import numpy as np import pytest import ray @@ -22,6 +23,23 @@ def test_put_local_get(ray_start_regular): ray._end_read_channel(ref) +def test_put_different_meta(ray_start_regular): + ref = ray._create_channel(1000) + + def _test(val): + ray._write_channel(val, ref, num_readers=1) + if isinstance(val, np.ndarray): + assert np.array_equal(ray.get(ref), val) + else: + assert ray.get(ref) == val + ray._end_read_channel(ref) + + _test(b"hello") + _test("hello") + _test(1000) + _test(np.random.rand(10)) + + @pytest.mark.parametrize("num_readers", [1, 4]) def test_put_remote_get(ray_start_regular, num_readers): ref = ray._create_channel(1000) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index 970fb9d096370..7c95845012896 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -10,7 +10,8 @@ void PrintPlasmaObjectHeader(const PlasmaObjectHeader *header) { << "\n" << "num_read_releases_remaining: " << header->num_read_releases_remaining << "\n" - << "data_size: " << header->data_size << "\n"; + << "data_size: " << header->data_size << "\n" + << "metadata_size: " << header->metadata_size << "\n"; } void PlasmaObjectHeader::Init() { @@ -45,8 +46,13 @@ uint64_t PlasmaObjectHeader::GetDataSize() const { return data_size; } -void PlasmaObjectHeader::WriteAcquire(int64_t write_version, uint64_t new_size) { - RAY_LOG(DEBUG) << "WriteAcquire. version: " << write_version; +void PlasmaObjectHeader::WriteAcquire(int64_t write_version, + uint64_t write_data_size, + uint64_t write_metadata_size, + int64_t write_num_readers) { + RAY_LOG(DEBUG) << "WriteAcquire. version: " << write_version << ", data size " + << write_data_size << ", metadata size " << write_metadata_size + << ", num readers: " << write_num_readers; sem_wait(&rw_semaphore); RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); PrintPlasmaObjectHeader(this); @@ -58,9 +64,10 @@ void PlasmaObjectHeader::WriteAcquire(int64_t write_version, uint64_t new_size) << " is more than 1 greater than current version " << version << ". 
Are you sure this is the only writer?"; - num_readers = 0; version = write_version; - data_size = new_size; + data_size = write_data_size; + metadata_size = write_metadata_size; + num_readers = write_num_readers; RAY_LOG(DEBUG) << "WriteAcquire done"; PrintPlasmaObjectHeader(this); diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 395c86ee8223b..31083942a928d 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -80,8 +80,13 @@ struct PlasmaObjectHeader { // Blocks until there are no more readers. // NOTE: Caller should ensure there is one writer at a time. /// \param write_version The new version for write. - /// \param new_size The new data size of the object. - void WriteAcquire(int64_t write_version, uint64_t new_data_size); + /// \param data_size The new data size of the object. + /// \param metadata_size The new metadata size of the object. + /// \param num_readers The number of readers for the object. + void WriteAcquire(int64_t write_version, + uint64_t data_size, + uint64_t metadata_size, + int64_t num_readers); // Call after completing a write to signal that readers may read. // num_readers should be set before calling this. diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index b25d9d28853b5..8a5637957a3ef 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -168,8 +168,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this &object_entry); + void EnsureGetAcquired(std::unique_ptr &object_entry); Status GetRelease(const ObjectID &object_id); @@ -417,18 +416,14 @@ Status PlasmaClient::Impl::WriteAcquireMutableObject(const ObjectID &object_id, // Wait for no readers. auto plasma_header = GetPlasmaObjectHeader(entry->object); - // NOTE: entry->object.data_size is the size of the data buffer. - // When the object is shared, we can have object size smaller than the data buffer. // TODO(swang): Better exception. - // TODO(swang): Support data size larger than allocated buffer. - RAY_CHECK(data_size <= entry->object.data_size) - << "Cannot write mutable data size " << data_size - << " larger than allocated buffer size " << entry->object.data_size; - // TODO(swang): Support different metadata size. - RAY_CHECK(metadata_size == entry->object.metadata_size) - << "Metadata size must stay the same"; - plasma_header->WriteAcquire(entry->next_version_to_write, data_size); - plasma_header->num_readers = num_readers; + // TODO(swang): Support data + metadata size larger than allocated buffer. + RAY_CHECK(data_size + metadata_size <= entry->object.allocated_size) + << "Cannot write mutable data size " << data_size << " + metadata size " + << metadata_size << " larger than allocated buffer size " + << entry->object.allocated_size; + plasma_header->WriteAcquire( + entry->next_version_to_write, data_size, metadata_size, num_readers); // Prepare the data buffer and return to the client instead of sending // the IPC to object store. @@ -439,8 +434,7 @@ Status PlasmaClient::Impl::WriteAcquireMutableObject(const ObjectID &object_id, data_size); if (metadata != NULL) { // Copy the metadata to the buffer. 
- memcpy( - (*data)->Data() + entry->object.data_size, metadata, entry->object.metadata_size); + memcpy((*data)->Data() + data_size, metadata, metadata_size); } entry->is_sealed = false; @@ -499,9 +493,11 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, // The corresponding WriteRelease takes place in Seal. // When an object is first created, the data size is equivalent to // buffer size. - plasma_header->WriteAcquire(entry->next_version_to_write, data_size); - // Anyone may read. - plasma_header->num_readers = -1; + plasma_header->WriteAcquire(entry->next_version_to_write, + data_size, + metadata_size, + // Anyone may read an immutable object. + /*num_readers=*/-1); } } @@ -569,11 +565,14 @@ Status PlasmaClient::Impl::GetBuffers( all_present = false; } else { // Wait for the object to become ready to read. - auto plasma_header = EnsureGetAcquired(object_entry->second); + EnsureGetAcquired(object_entry->second); PlasmaObject *object = &object_entry->second->object; std::shared_ptr physical_buf; + RAY_LOG(DEBUG) << "Plasma Get " << object_ids[i] + << ", data size: " << object->data_size + << ", metadata size: " << object->metadata_size; if (object->device_num == 0) { uint8_t *data = LookupMmappedFile(object->store_fd); physical_buf = std::make_shared( @@ -582,8 +581,8 @@ Status PlasmaClient::Impl::GetBuffers( RAY_LOG(FATAL) << "GPU library is not enabled."; } physical_buf = wrap_buffer(object_ids[i], physical_buf); - auto data_size = plasma_header->GetDataSize(); - object_buffers[i].data = SharedMemoryBuffer::Slice(physical_buf, 0, data_size); + object_buffers[i].data = + SharedMemoryBuffer::Slice(physical_buf, 0, object->data_size); object_buffers[i].metadata = SharedMemoryBuffer::Slice( physical_buf, object->data_size, object->metadata_size); object_buffers[i].device_num = object->device_num; @@ -647,9 +646,11 @@ Status PlasmaClient::Impl::GetBuffers( // Wait for the object to become ready to read. RAY_CHECK(!object_entry->read_acquired); - auto plasma_header = EnsureGetAcquired(object_entry); - auto data_size = plasma_header->GetDataSize(); + EnsureGetAcquired(object_entry); std::shared_ptr physical_buf; + RAY_LOG(DEBUG) << "Plasma Get " << received_object_ids[i] + << ", data size: " << object->data_size + << ", metadata size: " << object->metadata_size; if (object->device_num == 0) { uint8_t *data = LookupMmappedFile(object->store_fd); physical_buf = std::make_shared( @@ -659,7 +660,8 @@ Status PlasmaClient::Impl::GetBuffers( } // Finish filling out the return values. 
physical_buf = wrap_buffer(object_ids[i], physical_buf); - object_buffers[i].data = SharedMemoryBuffer::Slice(physical_buf, 0, data_size); + object_buffers[i].data = + SharedMemoryBuffer::Slice(physical_buf, 0, object->data_size); object_buffers[i].metadata = SharedMemoryBuffer::Slice( physical_buf, object->data_size, object->metadata_size); object_buffers[i].device_num = object->device_num; @@ -689,12 +691,12 @@ Status PlasmaClient::Impl::Get(const std::vector &object_ids, &object_ids[0], num_objects, timeout_ms, wrap_buffer, &(*out)[0], is_from_worker); } -ray::PlasmaObjectHeader *PlasmaClient::Impl::EnsureGetAcquired( +void PlasmaClient::Impl::EnsureGetAcquired( std::unique_ptr &object_entry) { PlasmaObject *object = &object_entry->object; auto plasma_header = GetPlasmaObjectHeader(*object); if (object_entry->read_acquired) { - return plasma_header; + return; } int64_t version_read = plasma_header->ReadAcquire(object_entry->next_version_to_read); @@ -702,8 +704,17 @@ ray::PlasmaObjectHeader *PlasmaClient::Impl::EnsureGetAcquired( if (version_read > 0) { object_entry->is_mutable = true; object_entry->next_version_to_read = version_read; + + // The data and metadata size may have changed, so update here before we + // create the Get buffer to return. + object_entry->object.data_size = plasma_header->data_size; + object_entry->object.metadata_size = plasma_header->metadata_size; + object_entry->object.metadata_offset = + object_entry->object.data_offset + object_entry->object.data_size; + RAY_CHECK(object_entry->object.data_size + object_entry->object.metadata_size <= + object_entry->object.allocated_size); } - return plasma_header; + return; } Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { @@ -725,8 +736,9 @@ Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { "ray.release() called on an object that is not mutable"); } - auto plasma_header = EnsureGetAcquired(entry); + EnsureGetAcquired(entry); RAY_LOG(DEBUG) << "Release shared object " << object_id; + auto plasma_header = GetPlasmaObjectHeader(entry->object); plasma_header->ReadRelease(entry->next_version_to_read); // The next read needs to read at least this version. entry->next_version_to_read++; diff --git a/src/ray/object_manager/plasma/common.h b/src/ray/object_manager/plasma/common.h index d74eb88cec8b8..d24a110c32e44 100644 --- a/src/ray/object_manager/plasma/common.h +++ b/src/ray/object_manager/plasma/common.h @@ -140,6 +140,10 @@ class LocalObject { GetObjectInfo().data_size; object->data_size = GetObjectInfo().data_size; object->metadata_size = GetObjectInfo().metadata_size; + // Senders and receivers of a channel may store different data and metadata + // sizes locally depending on what data is written to the channel, but the + // plasma store keeps the original data and metadata size. + object->allocated_size = object->data_size + object->metadata_size; object->device_num = GetAllocation().device_num; object->mmap_size = GetAllocation().mmap_size; } diff --git a/src/ray/object_manager/plasma/plasma.fbs b/src/ray/object_manager/plasma/plasma.fbs index ba2df089c6032..0c4f8ac66f48b 100644 --- a/src/ray/object_manager/plasma/plasma.fbs +++ b/src/ray/object_manager/plasma/plasma.fbs @@ -106,6 +106,10 @@ struct PlasmaObjectSpec { metadata_offset: ulong; // The size in bytes of the metadata. metadata_size: ulong; + // The allocated size. This is just data_size + metadata_size + // for immutable objects, but for mutable objects, the data size + // and metadata size may change. 
+ allocated_size: ulong; // Device to create buffer on. device_num: int; } diff --git a/src/ray/object_manager/plasma/plasma.h b/src/ray/object_manager/plasma/plasma.h index 775226c922665..bb21f394d5b0c 100644 --- a/src/ray/object_manager/plasma/plasma.h +++ b/src/ray/object_manager/plasma/plasma.h @@ -48,6 +48,9 @@ struct PlasmaObject { int64_t data_size; /// The size in bytes of the metadata. int64_t metadata_size; + /// The size in bytes that was allocated. data_size + metadata_size must fit + /// within this. + int64_t allocated_size; /// Device number object is on. int device_num; /// Set if device_num is equal to 0. diff --git a/src/ray/object_manager/plasma/protocol.cc b/src/ray/object_manager/plasma/protocol.cc index 50f1f60d332ec..c041486bdefec 100644 --- a/src/ray/object_manager/plasma/protocol.cc +++ b/src/ray/object_manager/plasma/protocol.cc @@ -268,6 +268,7 @@ Status SendCreateReply(const std::shared_ptr &client, object.data_size, object.metadata_offset, object.metadata_size, + object.allocated_size, object.device_num); auto object_string = fbb.CreateString(object_id.Binary()); fb::PlasmaCreateReplyBuilder crb(fbb); @@ -309,6 +310,7 @@ Status ReadCreateReply(uint8_t *data, object->data_size = message->plasma_object()->data_size(); object->metadata_offset = message->plasma_object()->metadata_offset(); object->metadata_size = message->plasma_object()->metadata_size(); + object->allocated_size = message->plasma_object()->allocated_size(); store_fd->first = INT2FD(message->store_fd()); store_fd->second = message->unique_fd_id(); @@ -624,6 +626,7 @@ Status SendGetReply(const std::shared_ptr &client, object.data_size, object.metadata_offset, object.metadata_size, + object.allocated_size, object.device_num)); } std::vector store_fds_as_int; @@ -665,6 +668,7 @@ Status ReadGetReply(uint8_t *data, plasma_objects[i].data_size = object->data_size(); plasma_objects[i].metadata_offset = object->metadata_offset(); plasma_objects[i].metadata_size = object->metadata_size(); + plasma_objects[i].allocated_size = object->allocated_size(); plasma_objects[i].device_num = object->device_num(); } RAY_CHECK(message->store_fds()->size() == message->mmap_sizes()->size()); From cbe257fe381507342b76de30736b7ed4581a847f Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 30 Nov 2023 17:53:25 -0800 Subject: [PATCH 14/66] better error message Signed-off-by: Stephanie Wang --- python/ray/tests/test_accelerated_dag.py | 5 +++++ src/ray/object_manager/plasma/client.cc | 12 +++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index e8ea4e7eb4a13..fa7805a5b9de0 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -39,6 +39,11 @@ def _test(val): _test(1000) _test(np.random.rand(10)) + with pytest.raises(ValueError): + _test(np.random.rand(100)) + + _test(np.random.rand(1)) + @pytest.mark.parametrize("num_readers", [1, 4]) def test_put_remote_get(ray_start_regular, num_readers): diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 8a5637957a3ef..d76f028c0d916 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -416,12 +416,14 @@ Status PlasmaClient::Impl::WriteAcquireMutableObject(const ObjectID &object_id, // Wait for no readers. auto plasma_header = GetPlasmaObjectHeader(entry->object); - // TODO(swang): Better exception. 
// TODO(swang): Support data + metadata size larger than allocated buffer. - RAY_CHECK(data_size + metadata_size <= entry->object.allocated_size) - << "Cannot write mutable data size " << data_size << " + metadata size " - << metadata_size << " larger than allocated buffer size " - << entry->object.allocated_size; + if (data_size + metadata_size > entry->object.allocated_size) { + return Status::InvalidArgument("Serialized size of mutable data (" + + std::to_string(data_size) + ") + metadata size (" + + std::to_string(metadata_size) + + ") is larger than allocated buffer size " + + std::to_string(entry->object.allocated_size)); + } plasma_header->WriteAcquire( entry->next_version_to_write, data_size, metadata_size, num_readers); From a68cefd2e974f1e3b56f79e4bcb36a628e25c155 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 1 Dec 2023 10:48:49 -0800 Subject: [PATCH 15/66] cleanup Signed-off-by: Stephanie Wang --- python/ray/__init__.py | 3 - python/ray/_private/ray_perf.py | 22 +-- python/ray/_private/worker.py | 91 +++------- python/ray/_raylet.pxd | 2 +- python/ray/_raylet.pyx | 73 ++++---- python/ray/experimental/channel.py | 142 +++++++++++++++ python/ray/includes/libcoreworker.pxd | 6 +- python/ray/tests/test_accelerated_dag.py | 33 ++-- src/ray/core_worker/core_worker.cc | 47 +++-- src/ray/core_worker/core_worker.h | 39 +++- .../store_provider/plasma_store_provider.cc | 61 ++++--- .../store_provider/plasma_store_provider.h | 36 +++- src/ray/object_manager/common.cc | 9 - src/ray/object_manager/common.h | 33 ++-- src/ray/object_manager/plasma/client.cc | 166 +++++++++--------- src/ray/object_manager/plasma/client.h | 50 ++++-- 16 files changed, 519 insertions(+), 294 deletions(-) create mode 100644 python/ray/experimental/channel.py diff --git a/python/ray/__init__.py b/python/ray/__init__.py index 95ac20ceb15fb..e74749ab6e8fa 100644 --- a/python/ray/__init__.py +++ b/python/ray/__init__.py @@ -114,9 +114,6 @@ def _configure_system(): WORKER_MODE, RESTORE_WORKER_MODE, SPILL_WORKER_MODE, - _create_channel, - _write_channel, - _end_read_channel, cancel, get, get_actor, diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py index 1d07c49aea89f..dcc49d42b2926 100644 --- a/python/ray/_private/ray_perf.py +++ b/python/ray/_private/ray_perf.py @@ -8,6 +8,8 @@ import multiprocessing import ray +import ray.experimental.channel as ray_channel + logger = logging.getLogger(__name__) @@ -296,11 +298,11 @@ def async_actor_multi(): def put_channel_small(chans, num_readers=1, do_get=False, do_release=False): for chan in chans: - ray._write_channel(b"0", chan, num_readers=num_readers) + chan.write(b"0", num_readers=num_readers) if do_get: - ray.get(chan) + chan.begin_read() if do_release: - ray._end_read_channel(chan) + chan.end_read() @ray.remote class ChannelReader: @@ -310,10 +312,10 @@ def ready(self): def read(self, chans): while True: for chan in chans: - ray.get(chan) - ray._end_read_channel(chan) + chan.begin_read() + chan.end_read() - chans = [ray._create_channel(1000)] + chans = [ray_channel.Channel(1000)] results += timeit( "local put, single channel calls", lambda: put_channel_small(chans, do_release=True), @@ -323,7 +325,7 @@ def read(self, chans): lambda: put_channel_small(chans, do_get=True, do_release=True), ) - chans = [ray._create_channel(1000)] + chans = [ray_channel.Channel(1000)] reader = ChannelReader.remote() ray.get(reader.ready.remote()) reader.read.remote(chans) @@ -335,7 +337,7 @@ def read(self, chans): n_cpu = multiprocessing.cpu_count() // 2 
print(f"Testing multiple readers/channels, n={n_cpu}") - chans = [ray._create_channel(1000)] + chans = [ray_channel.Channel(1000)] readers = [ChannelReader.remote() for _ in range(n_cpu)] ray.get([reader.ready.remote() for reader in readers]) for reader in readers: @@ -347,7 +349,7 @@ def read(self, chans): for reader in readers: ray.kill(reader) - chans = [ray._create_channel(1000) for _ in range(n_cpu)] + chans = [ray_channel.Channel(1000) for _ in range(n_cpu)] reader = ChannelReader.remote() ray.get(reader.ready.remote()) reader.read.remote(chans) @@ -356,7 +358,7 @@ def read(self, chans): ) ray.kill(reader) - chans = [ray._create_channel(1000) for _ in range(n_cpu)] + chans = [ray_channel.Channel(1000) for _ in range(n_cpu)] readers = [ChannelReader.remote() for _ in range(n_cpu)] ray.get([reader.ready.remote() for reader in readers]) for chan, reader in zip(chans, readers): diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index fa93f8ea91851..216a651df87f7 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -688,7 +688,13 @@ def set_mode(self, mode): def set_load_code_from_local(self, load_code_from_local): self._load_code_from_local = load_code_from_local - def put_object(self, value, object_ref=None, owner_address=None, is_mutable=False): + def put_object( + self, + value: Any, + object_ref: Optional["ray.ObjectRef"] = None, + owner_address: Optional[str] = None, + _is_experimental_mutable_object: bool = False, + ): """Put value in the local object store with object reference `object_ref`. This assumes that the value for `object_ref` has not yet been placed in @@ -703,6 +709,10 @@ def put_object(self, value, object_ref=None, owner_address=None, is_mutable=Fals object_ref: The object ref of the value to be put. If None, one will be generated. owner_address: The serialized address of object's owner. + _is_experimental_mutable_object: An experimental flag for mutable + objects. If True, then the returned object will not have a + valid value. The object must be written to using the + ray.experimental.channel API before readers can read. Returns: ObjectRef: The object ref the object was put under. @@ -739,7 +749,7 @@ def put_object(self, value, object_ref=None, owner_address=None, is_mutable=Fals # If the object is mutable, then the raylet should never read the # object. Instead, clients will keep the object pinned. - pin_object = not is_mutable + pin_object = not _is_experimental_mutable_object # This *must* be the first place that we construct this python # ObjectRef because an entry with 0 local references is created when @@ -753,7 +763,7 @@ def put_object(self, value, object_ref=None, owner_address=None, is_mutable=Fals object_ref=object_ref, pin_object=pin_object, owner_address=owner_address, - is_mutable=is_mutable, + _is_experimental_mutable_object=_is_experimental_mutable_object, ), # The initial local reference is already acquired internally. skip_adding_local_ref=True, @@ -775,7 +785,12 @@ def deserialize_objects(self, data_metadata_pairs, object_refs): context = self.get_serialization_context() return context.deserialize_objects(data_metadata_pairs, object_refs) - def get_objects(self, object_refs: list, timeout: Optional[float] = None): + def get_objects( + self, + object_refs: list, + timeout: Optional[float] = None, + _is_experimental_mutable_object: bool = False, + ): """Get the values in the object store associated with the IDs. Return the values from the local object store for object_refs. 
This @@ -791,6 +806,10 @@ def get_objects(self, object_refs: list, timeout: Optional[float] = None): list: List of deserialized objects bytes: UUID of the debugger breakpoint we should drop into or b"" if there is no breakpoint. + _is_experimental_mutable_object: An experimental flag for mutable + objects. If True, then wait until there is a value available to + read. The object must also already be local, or else the get + call will hang. """ # Make sure that the values are object refs. for object_ref in object_refs: @@ -802,7 +821,10 @@ def get_objects(self, object_refs: list, timeout: Optional[float] = None): timeout_ms = int(timeout * 1000) if timeout is not None else -1 data_metadata_pairs = self.core_worker.get_objects( - object_refs, self.current_task_id, timeout_ms + object_refs, + self.current_task_id, + timeout_ms, + _is_experimental_mutable_object, ) debugger_breakpoint = b"" for data, metadata in data_metadata_pairs: @@ -2498,20 +2520,6 @@ def show_in_dashboard(message: str, key: str = "", dtype: str = "text"): blocking_get_inside_async_warned = False -def _end_read_channel(object_refs): - """ - Signal to the writer that the channel is ready to write again. The read - begins when the caller calls ray.get and a written value is available. If - ray.get is not called first, then this call will block until a value is - written, then drop the value. - """ - worker = global_worker - worker.check_connected() - if isinstance(object_refs, ObjectRef): - object_refs = [object_refs] - worker.core_worker.get_release(object_refs) - - @overload def get( object_refs: "Sequence[ObjectRef[Any]]", *, timeout: Optional[float] = None @@ -2643,51 +2651,6 @@ def get( return values -@PublicAPI -def _write_channel(value: Any, object_ref: ObjectRef, num_readers: int): - worker = global_worker - worker.check_connected() - - if num_readers <= 0: - raise ValueError("``num_readers`` must be a positive integer.") - - try: - serialized_value = worker.get_serialization_context().serialize(value) - except TypeError as e: - sio = io.StringIO() - ray.util.inspect_serializability(value, print_file=sio) - msg = ( - "Could not serialize the put value " f"{repr(value)}:\n" f"{sio.getvalue()}" - ) - raise TypeError(msg) from e - - worker.core_worker.put_serialized_object_to_mutable_plasma_object( - serialized_value, - object_ref, - num_readers, - ) - - -@PublicAPI -def _create_channel( - buffer_size: int, -) -> "ray.ObjectRef": - worker = global_worker - worker.check_connected() - - value = b"0" * buffer_size - - try: - object_ref = worker.put_object(value, owner_address=None, is_mutable=True) - except ObjectStoreFullError: - logger.info( - "Put failed since the value was either too large or the " - "store was full of pinned objects." 
- ) - raise - return object_ref - - @PublicAPI @client_mode_hook def put( diff --git a/python/ray/_raylet.pxd b/python/ray/_raylet.pxd index f4f54cffacec0..5d47073b74e8a 100644 --- a/python/ray/_raylet.pxd +++ b/python/ray/_raylet.pxd @@ -135,7 +135,7 @@ cdef class CoreWorker: c_bool created_by_worker, owner_address=*, c_bool inline_small_object=*, - c_bool is_mutable=*) + c_bool is_experimental_mutable_object=*) cdef unique_ptr[CAddress] _convert_python_address(self, address=*) cdef store_task_output( self, serialized_object, diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 3aa2422180c07..44694657d9c61 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -3324,25 +3324,19 @@ cdef class CoreWorker: return self.plasma_event_handler def get_objects(self, object_refs, TaskID current_task_id, - int64_t timeout_ms=-1): + int64_t timeout_ms=-1, + c_bool _is_experimental_mutable_object=False): cdef: c_vector[shared_ptr[CRayObject]] results CTaskID c_task_id = current_task_id.native() c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs) with nogil: op_status = CCoreWorkerProcess.GetCoreWorker().Get( - c_object_ids, timeout_ms, &results) + c_object_ids, timeout_ms, _is_experimental_mutable_object, &results) check_status(op_status) return RayObjectsToDataMetadataPairs(results) - def get_release(self, object_refs): - cdef: - c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs) - with nogil: - op_status = CCoreWorkerProcess.GetCoreWorker().GetRelease(c_object_ids) - check_status(op_status) - def get_if_local(self, object_refs): """Get objects from local plasma store directly without a fetch request to raylet.""" @@ -3374,7 +3368,7 @@ cdef class CoreWorker: c_bool created_by_worker, owner_address=None, c_bool inline_small_object=True, - c_bool is_mutable=False, + c_bool is_experimental_mutable_object=False, ): cdef: unique_ptr[CAddress] c_owner_address @@ -3385,7 +3379,8 @@ cdef class CoreWorker: with nogil: check_status(CCoreWorkerProcess.GetCoreWorker() .CreateOwnedAndIncrementLocalRef( - is_mutable, metadata, data_size, contained_ids, + is_experimental_mutable_object, metadata, + data_size, contained_ids, c_object_id, data, created_by_worker, move(c_owner_address), inline_small_object)) @@ -3474,10 +3469,10 @@ cdef class CoreWorker: generator_id=CObjectID.Nil(), owner_address=c_owner_address)) - def put_serialized_object_to_mutable_plasma_object(self, serialized_object, - ObjectRef object_ref, - num_readers, - ): + def experimental_mutable_object_put_serialized(self, serialized_object, + ObjectRef object_ref, + num_readers, + ): cdef: CObjectID c_object_id = object_ref.native() shared_ptr[CBuffer] data @@ -3485,13 +3480,14 @@ cdef class CoreWorker: metadata = string_to_buffer(serialized_object.metadata) data_size = serialized_object.total_bytes - check_status(CCoreWorkerProcess.GetCoreWorker().WriteAcquireMutableObject( - c_object_id, - metadata, - data_size, - num_readers, - &data, - )) + check_status(CCoreWorkerProcess.GetCoreWorker() + .ExperimentalMutableObjectWriteAcquire( + c_object_id, + metadata, + data_size, + num_readers, + &data, + )) if data_size > 0: (serialized_object).write_to( Buffer.make(data)) @@ -3501,13 +3497,30 @@ cdef class CoreWorker: generator_id=CObjectID.Nil(), owner_address=null_owner_address)) - def put_serialized_object_and_increment_local_ref(self, serialized_object, - ObjectRef object_ref=None, - c_bool pin_object=True, - owner_address=None, - c_bool inline_small_object=True, - c_bool is_mutable=False, - ): + 
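A minimal usage sketch (not part of the diff) of how these Cython bindings are driven through the ray.experimental.channel.Channel wrapper added later in this patch; it assumes a single-node ray.init() and mirrors test_put_local_get:

import ray
import ray.experimental.channel as ray_channel

ray.init()

# Allocate a 1000-byte mutable plasma buffer wrapped by a Channel. Only the
# creating process may write to it.
chan = ray_channel.Channel(1000)

for i in range(10):
    val = i.to_bytes(8, "little")
    # write() serializes the value into the existing buffer (via
    # experimental_mutable_object_put_serialized) and blocks until the
    # previous value has been released by its readers.
    chan.write(val, num_readers=1)
    assert chan.begin_read() == val
    chan.end_read()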
def experimental_mutable_object_read_release(self, object_refs): + """ + For experimental.channel.Channel. + + Signal to the writer that the channel is ready to write again. The read + began when the caller calls ray.get and a written value is available. If + ray.get is not called first, then this call will block until a value is + written, then drop the value. + """ + cdef: + c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs) + with nogil: + op_status = (CCoreWorkerProcess.GetCoreWorker() + .ExperimentalMutableObjectReadRelease(c_object_ids)) + check_status(op_status) + + def put_serialized_object_and_increment_local_ref( + self, serialized_object, + ObjectRef object_ref=None, + c_bool pin_object=True, + owner_address=None, + c_bool inline_small_object=True, + c_bool _is_experimental_mutable_object=False, + ): cdef: CObjectID c_object_id shared_ptr[CBuffer] data @@ -3524,7 +3537,7 @@ cdef class CoreWorker: metadata, total_bytes, object_ref, contained_object_ids, &c_object_id, &data, True, owner_address, inline_small_object, - is_mutable) + _is_experimental_mutable_object) logger.debug( f"Serialized object size of {c_object_id.Hex()} is {total_bytes} bytes") diff --git a/python/ray/experimental/channel.py b/python/ray/experimental/channel.py new file mode 100644 index 0000000000000..42f82e4aa3398 --- /dev/null +++ b/python/ray/experimental/channel.py @@ -0,0 +1,142 @@ +import io +import logging +from typing import Any, Optional + +import ray +from ray.util.annotations import PublicAPI + +# Logger for this module. It should be configured at the entry point +# into the program using Ray. Ray provides a default configuration at +# entry/init points. +logger = logging.getLogger(__name__) + + +def _create_channel_ref( + buffer_size: int, +) -> "ray.ObjectRef": + """ + Create a channel that can be read and written by co-located Ray processes. + + The channel has no buffer, so the writer will block until reader(s) have + read the previous value. Only the channel creator may write to the channel. + + Args: + buffer_size: The number of bytes to allocate for the object data and + metadata. Writes to the channel must produce serialized data and + metadata less than or equal to this value. + Returns: + Channel: A wrapper around ray.ObjectRef. + """ + worker = ray._private.worker.global_worker + worker.check_connected() + + value = b"0" * buffer_size + + try: + object_ref = worker.put_object( + value, owner_address=None, _is_experimental_mutable_object=True + ) + except ray.exceptions.ObjectStoreFullError: + logger.info( + "Put failed since the value was either too large or the " + "store was full of pinned objects." + ) + raise + return object_ref + + +@PublicAPI(stability="alpha") +class Channel: + """ + A wrapper type for ray.ObjectRef. Currently supports ray.get but not + ray.wait. + """ + + def __init__(self, buffer_size: Optional[int] = None): + """ + Create a channel that can be read and written by co-located Ray processes. + + Only the caller may write to the channel. The channel has no buffer, + so the writer will block until reader(s) have read the previous value. + + Args: + buffer_size: The number of bytes to allocate for the object data and + metadata. Writes to the channel must produce serialized data and + metadata less than or equal to this value. + Returns: + Channel: A wrapper around ray.ObjectRef. 
+ """ + if buffer_size is None: + self._base_ref = None + else: + self._base_ref = _create_channel_ref(buffer_size) + + self.worker = ray._private.worker.global_worker + self.worker.check_connected() + + @staticmethod + def _from_base_ref(base_ref: "ray.ObjectRef") -> "Channel": + chan = Channel() + chan._base_ref = base_ref + return chan + + def __reduce__(self): + return self._from_base_ref, (self._base_ref,) + + def write(self, value: Any, num_readers: int): + """ + Write a value to the channel. + + Blocks if there are still pending readers for the previous value. The + writer may not write again until the specified number of readers have + called ``end_read_channel``. + + Args: + value: The value to write. + num_readers: The number of readers that must read and release the value + before we can write again. + """ + if num_readers <= 0: + raise ValueError("``num_readers`` must be a positive integer.") + + try: + serialized_value = self.worker.get_serialization_context().serialize(value) + except TypeError as e: + sio = io.StringIO() + ray.util.inspect_serializability(value, print_file=sio) + msg = ( + "Could not serialize the put value " + f"{repr(value)}:\n" + f"{sio.getvalue()}" + ) + raise TypeError(msg) from e + + self.worker.core_worker.experimental_mutable_object_put_serialized( + serialized_value, + self._base_ref, + num_readers, + ) + + def begin_read(self) -> Any: + """ + Read the latest value from the channel. This call will block until a + value is available to read. + + Returns: + Any: The deserialized value. + """ + values, _ = self.worker.get_objects( + [self._base_ref], _is_experimental_mutable_object=True + ) + return values[0] + + def end_read(self): + """ + Signal to the writer that the channel is ready to write again. + + If begin_read is not called first, then this call will block until a + value is written, then drop the value. 
+ """ + self.worker.core_worker.experimental_mutable_object_read_release( + [self._base_ref] + ) diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index bf6a6e810fc9d..00bf9b5f9d4e6 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -240,7 +240,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: const CAddress &owner_address, shared_ptr[CBuffer] *data, c_bool created_by_worker) - CRayStatus WriteAcquireMutableObject( + CRayStatus ExperimentalMutableObjectWriteAcquire( const CObjectID &object_id, const shared_ptr[CBuffer] &metadata, uint64_t data_size, @@ -251,8 +251,10 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: CRayStatus SealExisting(const CObjectID &object_id, c_bool pin_object, const CObjectID &generator_id, const unique_ptr[CAddress] &owner_address) - CRayStatus GetRelease(const c_vector[CObjectID] &object_ids) + CRayStatus ExperimentalMutableObjectReadRelease( + const c_vector[CObjectID] &object_ids) CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms, + c_bool is_experimental_mutable_object, c_vector[shared_ptr[CRayObject]] *results) CRayStatus GetIfLocal( const c_vector[CObjectID] &ids, diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index fa7805a5b9de0..3795baaddf9fc 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -8,31 +8,34 @@ import ray import ray.cluster_utils +import ray.experimental.channel as ray_channel logger = logging.getLogger(__name__) def test_put_local_get(ray_start_regular): - ref = ray._create_channel(1000) + chan = ray_channel.Channel(1000) num_writes = 1000 for i in range(num_writes): val = i.to_bytes(8, "little") - ray._write_channel(val, ref, num_readers=1) - assert ray.get(ref) == val - ray._end_read_channel(ref) + chan.write(val, num_readers=1) + assert chan.begin_read() == val + chan.end_read() def test_put_different_meta(ray_start_regular): - ref = ray._create_channel(1000) + chan = ray_channel.Channel(1000) def _test(val): - ray._write_channel(val, ref, num_readers=1) + chan.write(val, num_readers=1) + + read_val = chan.begin_read() if isinstance(val, np.ndarray): - assert np.array_equal(ray.get(ref), val) + assert np.array_equal(read_val, val) else: - assert ray.get(ref) == val - ray._end_read_channel(ref) + assert read_val == val + chan.end_read() _test(b"hello") _test("hello") @@ -47,25 +50,25 @@ def _test(val): @pytest.mark.parametrize("num_readers", [1, 4]) def test_put_remote_get(ray_start_regular, num_readers): - ref = ray._create_channel(1000) + chan = ray_channel.Channel(1000) @ray.remote(num_cpus=0) class Reader: def __init__(self): pass - def read(self, ref, num_writes): + def read(self, chan, num_writes): for i in range(num_writes): val = i.to_bytes(8, "little") - assert ray.get(ref[0]) == val - ray._end_read_channel(ref) + assert chan.begin_read() == val + chan.end_read() num_writes = 1000 readers = [Reader.remote() for _ in range(num_readers)] - done = [reader.read.remote([ref], num_writes) for reader in readers] + done = [reader.read.remote(chan, num_writes) for reader in readers] for i in range(num_writes): val = i.to_bytes(8, "little") - ray._write_channel(val, ref, num_readers=num_readers) + chan.write(val, num_readers=num_readers) ray.get(done) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 72f5b698d9234..d58640908c905 100644 --- a/src/ray/core_worker/core_worker.cc 
+++ b/src/ray/core_worker/core_worker.cc @@ -1219,7 +1219,7 @@ Status CoreWorker::Put(const RayObject &object, } Status CoreWorker::CreateOwnedAndIncrementLocalRef( - bool is_mutable, + bool is_experimental_mutable_object, const std::shared_ptr &metadata, const size_t data_size, const std::vector &contained_object_ids, @@ -1295,7 +1295,7 @@ Status CoreWorker::CreateOwnedAndIncrementLocalRef( /* owner_address = */ real_owner_address, data, created_by_worker, - is_mutable); + is_experimental_mutable_object); } if (!status.ok()) { RemoveLocalReference(*object_id); @@ -1326,12 +1326,13 @@ Status CoreWorker::CreateExisting(const std::shared_ptr &metadata, } } -Status CoreWorker::WriteAcquireMutableObject(const ObjectID &object_id, - const std::shared_ptr &metadata, - uint64_t data_size, - int64_t num_readers, - std::shared_ptr *data) { - return plasma_store_provider_->WriteAcquireMutableObject( +Status CoreWorker::ExperimentalMutableObjectWriteAcquire( + const ObjectID &object_id, + const std::shared_ptr &metadata, + uint64_t data_size, + int64_t num_readers, + std::shared_ptr *data) { + return plasma_store_provider_->ExperimentalMutableObjectWriteAcquire( object_id, metadata, data_size, num_readers, data); } @@ -1378,13 +1379,15 @@ Status CoreWorker::SealExisting(const ObjectID &object_id, return Status::OK(); } -Status CoreWorker::GetRelease(const std::vector &object_ids) { +Status CoreWorker::ExperimentalMutableObjectReadRelease( + const std::vector &object_ids) { RAY_CHECK(object_ids.size() == 1); - return plasma_store_provider_->GetRelease(object_ids[0]); + return plasma_store_provider_->ExperimentalMutableObjectReadRelease(object_ids[0]); } Status CoreWorker::Get(const std::vector &ids, const int64_t timeout_ms, + bool is_experimental_mutable_object, std::vector> *results) { std::unique_ptr state = nullptr; if (options_.worker_type == WorkerType::WORKER) { @@ -1452,6 +1455,7 @@ Status CoreWorker::Get(const std::vector &ids, RAY_LOG(DEBUG) << "Plasma GET timeout " << local_timeout_ms; RAY_RETURN_NOT_OK(plasma_store_provider_->Get(plasma_object_ids, local_timeout_ms, + is_experimental_mutable_object, worker_context_, &result_map, &got_exception)); @@ -2904,8 +2908,12 @@ bool CoreWorker::PinExistingReturnObject(const ObjectID &return_id, reference_counter_->AddLocalReference(return_id, ""); reference_counter_->AddBorrowedObject(return_id, ObjectID::Nil(), owner_address); - auto status = plasma_store_provider_->Get( - {return_id}, 0, worker_context_, &result_map, &got_exception); + auto status = plasma_store_provider_->Get({return_id}, + 0, + /*is_experimental_mutable_object=*/false, + worker_context_, + &result_map, + &got_exception); // Remove the temporary ref. 
RemoveLocalReference(return_id); @@ -3168,8 +3176,13 @@ Status CoreWorker::GetAndPinArgsForExecutor(const TaskSpecification &task, RAY_RETURN_NOT_OK( memory_store_->Get(by_ref_ids, -1, worker_context_, &result_map, &got_exception)); } else { - RAY_RETURN_NOT_OK(plasma_store_provider_->Get( - by_ref_ids, -1, worker_context_, &result_map, &got_exception)); + RAY_RETURN_NOT_OK( + plasma_store_provider_->Get(by_ref_ids, + -1, + /*is_experimental_mutable_object=*/false, + worker_context_, + &result_map, + &got_exception)); } for (const auto &it : result_map) { for (size_t idx : by_ref_indices[it.first]) { @@ -4163,7 +4176,11 @@ void CoreWorker::PlasmaCallback(SetResultCallback success, bool object_is_local = false; if (Contains(object_id, &object_is_local).ok() && object_is_local) { std::vector> vec; - if (Get(std::vector{object_id}, 0, &vec).ok()) { + if (Get(std::vector{object_id}, + 0, + /*is_experimental_mutable_object=*/false, + &vec) + .ok()) { RAY_CHECK(vec.size() > 0) << "Failed to get local object but Raylet notified object is local."; return success(vec.front(), object_id, py_future); diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 3db71661d8695..ea01d202fba75 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -602,6 +602,10 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// ensure that they decrement the ref count once the returned ObjectRef has /// gone out of scope. /// + /// \param[in] is_experimental_mutable_object Whether this object is an + /// experimental mutable object. If true, then the returned object buffer + /// will not be available to read until the caller Seals and then writes + /// again. /// \param[in] metadata Metadata of the object to be written. /// \param[in] data_size Size of the object to be written. /// \param[in] contained_object_ids The IDs serialized in this object. @@ -614,7 +618,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// small. /// \return Status. Status CreateOwnedAndIncrementLocalRef( - bool is_mutable, + bool is_experimental_mutable_object, const std::shared_ptr &metadata, const size_t data_size, const std::vector &contained_object_ids, @@ -643,12 +647,6 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { std::shared_ptr *data, bool created_by_worker); - Status WriteAcquireMutableObject(const ObjectID &object_id, - const std::shared_ptr &metadata, - uint64_t data_size, - int64_t num_readers, - std::shared_ptr *data); - /// Finalize placing an object into the object store. This should be called after /// a corresponding `CreateOwned()` call and then writing into the returned buffer. /// @@ -682,7 +680,31 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { const ObjectID &generator_id = ObjectID::Nil(), const std::unique_ptr &owner_address = nullptr); - Status GetRelease(const std::vector &object_ids); + /// Experimental method for mutable objects. Acquires a write lock on the + /// object that prevents readers from reading until we are done writing. Does + /// not protect against concurrent writers. + /// + /// \param[in] object_id The ID of the object. + /// \param[in] metadata The metadata of the object. This overwrites the + /// current metadata. + /// \param[in] data_size The size of the object to write. This overwrites the + /// current data size. + /// \param[in] num_readers The number of readers that must read and release + /// the object before the caller can write again. 
+ /// \param[out] data The mutable object buffer in plasma that can be written to. + Status ExperimentalMutableObjectWriteAcquire(const ObjectID &object_id, + const std::shared_ptr &metadata, + uint64_t data_size, + int64_t num_readers, + std::shared_ptr *data); + + /// Experimental method for mutable objects. Releases the objects, allowing them + /// to be written again. If the caller did not previously Get the objects, + /// then this first blocks until the latest value is available to read, then + /// releases the value. + /// + /// \param[in] object_ids The IDs of the objects. + Status ExperimentalMutableObjectReadRelease(const std::vector &object_ids); /// Get a list of objects from the object store. Objects that failed to be retrieved /// will be returned as nullptrs. @@ -693,6 +715,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// \return Status. Status Get(const std::vector &ids, const int64_t timeout_ms, + bool is_experimental_mutable_object, std::vector> *results); /// Get objects directly from the local plasma store, without waiting for the diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.cc b/src/ray/core_worker/store_provider/plasma_store_provider.cc index 30ae14daef662..6bfd686a987f1 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.cc +++ b/src/ray/core_worker/store_provider/plasma_store_provider.cc @@ -108,18 +108,19 @@ Status CoreWorkerPlasmaStoreProvider::Put(const RayObject &object, return Status::OK(); } -Status CoreWorkerPlasmaStoreProvider::WriteAcquireMutableObject( +Status CoreWorkerPlasmaStoreProvider::ExperimentalMutableObjectWriteAcquire( const ObjectID &object_id, const std::shared_ptr &metadata, uint64_t data_size, int64_t num_readers, std::shared_ptr *data) { - return store_client_.WriteAcquireMutableObject(object_id, - data_size, - metadata ? metadata->Data() : nullptr, - metadata ? metadata->Size() : 0, - num_readers, - data); + return store_client_.ExperimentalMutableObjectWriteAcquire( + object_id, + data_size, + metadata ? metadata->Data() : nullptr, + metadata ? 
metadata->Size() : 0, + num_readers, + data); } Status CoreWorkerPlasmaStoreProvider::Create(const std::shared_ptr &metadata, @@ -181,19 +182,21 @@ Status CoreWorkerPlasmaStoreProvider::FetchAndGetFromPlasmaStore( absl::flat_hash_set &remaining, const std::vector &batch_ids, int64_t timeout_ms, + bool send_fetch_or_reconstruct_ipc, bool fetch_only, bool in_direct_call, const TaskID &task_id, absl::flat_hash_map> *results, bool *got_exception) { - const auto owner_addresses = reference_counter_->GetOwnerAddresses(batch_ids); - // TODO this IPC needs to be skipped in shared mode - // RAY_RETURN_NOT_OK( - // raylet_client_->FetchOrReconstruct(batch_ids, - // owner_addresses, - // fetch_only, - // /*mark_worker_blocked*/ !in_direct_call, - // task_id)); + if (send_fetch_or_reconstruct_ipc) { + const auto owner_addresses = reference_counter_->GetOwnerAddresses(batch_ids); + RAY_RETURN_NOT_OK( + raylet_client_->FetchOrReconstruct(batch_ids, + owner_addresses, + fetch_only, + /*mark_worker_blocked*/ !in_direct_call, + task_id)); + } std::vector plasma_results; RAY_RETURN_NOT_OK(store_client_.Get(batch_ids, @@ -232,8 +235,9 @@ Status CoreWorkerPlasmaStoreProvider::FetchAndGetFromPlasmaStore( return Status::OK(); } -Status CoreWorkerPlasmaStoreProvider::GetRelease(const ObjectID &object_id) { - return store_client_.GetRelease(object_id); +Status CoreWorkerPlasmaStoreProvider::ExperimentalMutableObjectReadRelease( + const ObjectID &object_id) { + return store_client_.ExperimentalMutableObjectReadRelease(object_id); } Status CoreWorkerPlasmaStoreProvider::GetIfLocal( @@ -287,6 +291,7 @@ Status UnblockIfNeeded(const std::shared_ptr &client, Status CoreWorkerPlasmaStoreProvider::Get( const absl::flat_hash_set &object_ids, int64_t timeout_ms, + bool is_experimental_mutable_object, const WorkerContext &ctx, absl::flat_hash_map> *results, bool *got_exception) { @@ -302,14 +307,17 @@ Status CoreWorkerPlasmaStoreProvider::Get( for (int64_t i = start; i < batch_size && i < total_size; i++) { batch_ids.push_back(id_vector[start + i]); } - RAY_RETURN_NOT_OK(FetchAndGetFromPlasmaStore(remaining, - batch_ids, - /*timeout_ms=*/0, - /*fetch_only=*/true, - ctx.CurrentTaskIsDirectCall(), - ctx.GetCurrentTaskID(), - results, - got_exception)); + RAY_RETURN_NOT_OK(FetchAndGetFromPlasmaStore( + remaining, + batch_ids, + /*timeout_ms=*/0, + // Mutable objects must be local before ray.get. + /*send_fetch_or_reconstruct_ipc=*/!is_experimental_mutable_object, + /*fetch_only=*/true, + ctx.CurrentTaskIsDirectCall(), + ctx.GetCurrentTaskID(), + results, + got_exception)); } // If all objects were fetched already, return. Note that we always need to @@ -318,6 +326,8 @@ Status CoreWorkerPlasmaStoreProvider::Get( return UnblockIfNeeded(raylet_client_, ctx); } + RAY_CHECK(!is_experimental_mutable_object) << "Mutable objects must always be local"; + // If not all objects were successfully fetched, repeatedly call FetchOrReconstruct // and Get from the local object store in batches. This loop will run indefinitely // until the objects are all fetched if timeout is -1. 
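The send_fetch_or_reconstruct_ipc flag above is what makes mutable channel objects local-only: no fetch request is sent to the raylet, so readers must be co-located with the plasma store that holds the channel. A rough sketch of that usage pattern, assuming a single-node ray.init() (the Reader actor name is illustrative; compare test_put_remote_get):

import ray
import ray.experimental.channel as ray_channel

ray.init()

@ray.remote(num_cpus=0)
class Reader:
    def read(self, chan):
        # begin_read() blocks until the writer publishes a value. No
        # FetchOrReconstruct IPC is issued, so the underlying plasma object
        # must already live in this node's local store.
        val = chan.begin_read()
        chan.end_read()
        return val

chan = ray_channel.Channel(1000)
reader = Reader.remote()
result = reader.read.remote(chan)  # Blocks inside the actor until the write below.

chan.write(b"ping", num_readers=1)
assert ray.get(result) == b"ping"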
@@ -351,6 +361,7 @@ Status CoreWorkerPlasmaStoreProvider::Get( RAY_RETURN_NOT_OK(FetchAndGetFromPlasmaStore(remaining, batch_ids, batch_timeout, + /*send_fetch_or_reconstruct_ipc=*/true, /*fetch_only=*/false, ctx.CurrentTaskIsDirectCall(), ctx.GetCurrentTaskID(), diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.h b/src/ray/core_worker/store_provider/plasma_store_provider.h index 2c7242a02f4a1..fff93c48c2e4a 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.h +++ b/src/ray/core_worker/store_provider/plasma_store_provider.h @@ -129,12 +129,6 @@ class CoreWorkerPlasmaStoreProvider { bool created_by_worker, bool is_mutable = false); - Status WriteAcquireMutableObject(const ObjectID &object_id, - const std::shared_ptr &metadata, - uint64_t data_size, - int64_t num_readers, - std::shared_ptr *data); - /// Seal an object buffer created with Create(). /// /// NOTE: The caller must subsequently call Release() to release the first reference to @@ -154,12 +148,11 @@ class CoreWorkerPlasmaStoreProvider { Status Get(const absl::flat_hash_set &object_ids, int64_t timeout_ms, + bool is_experimental_mutable_object, const WorkerContext &ctx, absl::flat_hash_map> *results, bool *got_exception); - Status GetRelease(const ObjectID &object_id); - /// Get objects directly from the local plasma store, without waiting for the /// objects to be fetched from another node. This should only be used /// internally, never by user code. @@ -189,6 +182,32 @@ class CoreWorkerPlasmaStoreProvider { std::string MemoryUsageString(); + /// Experimental method for mutable objects. Acquires a write lock on the + /// object that prevents readers from reading until we are done writing. Does + /// not protect against concurrent writers. + /// + /// \param[in] object_id The ID of the object. + /// \param[in] metadata The metadata of the object. This overwrites the + /// current metadata. + /// \param[in] data_size The size of the object to write. This overwrites the + /// current data size. + /// \param[in] num_readers The number of readers that must read and release + /// the object before the caller can write again. + /// \param[out] data The mutable object buffer in plasma that can be written to. + Status ExperimentalMutableObjectWriteAcquire(const ObjectID &object_id, + const std::shared_ptr &metadata, + uint64_t data_size, + int64_t num_readers, + std::shared_ptr *data); + + /// Experimental method for mutable objects. Releases the objects, allowing them + /// to be written again. If the caller did not previously Get the objects, + /// then this first blocks until the latest value is available to read, then + /// releases the value. + /// + /// \param[in] object_id The ID of the object. + Status ExperimentalMutableObjectReadRelease(const ObjectID &object_id); + private: /// Ask the raylet to fetch a set of objects and then attempt to get them /// from the local plasma store. 
Successfully fetched objects will be removed @@ -211,6 +230,7 @@ class CoreWorkerPlasmaStoreProvider { absl::flat_hash_set &remaining, const std::vector &batch_ids, int64_t timeout_ms, + bool send_fetch_or_reconstruct_ipc, bool fetch_only, bool in_direct_call_task, const TaskID &task_id, diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index 7c95845012896..a362f784e41f7 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -37,15 +37,6 @@ void PlasmaObjectHeader::Destroy() { RAY_CHECK(sem_destroy(&rw_semaphore) == 0); } -// Get the data size of the plasma object. -// This has to be called only when reader lock is acquired -// via ReadAcquire. -uint64_t PlasmaObjectHeader::GetDataSize() const { - RAY_CHECK_NE(num_read_releases_remaining, 0) - << "ReadAcquire has to be called before calling this method."; - return data_size; -} - void PlasmaObjectHeader::WriteAcquire(int64_t write_version, uint64_t write_data_size, uint64_t write_metadata_size, diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 31083942a928d..6d9a8655dd2c9 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -39,6 +39,12 @@ using RestoreSpilledObjectCallback = const std::string &, std::function)>; +/// A header for all plasma objects that is allocated and stored in shared +/// memory. Therefore, it can be accessed across processes. +/// +/// For normal immutable objects, no synchronization between processes is +/// needed once the object has been Sealed. For experimental mutable objects, +/// we use the header to synchronize between writer and readers. struct PlasmaObjectHeader { // Used to signal to the writer when all readers are done. sem_t rw_semaphore; @@ -77,8 +83,10 @@ struct PlasmaObjectHeader { void Destroy(); - // Blocks until there are no more readers. - // NOTE: Caller should ensure there is one writer at a time. + /// Blocks until all readers for the previous write have ReadRelease'd the value. + /// Caller must ensure there is one writer at a time. Caller must pass + /// consecutive versions on each new write, starting with write_version=1. + /// /// \param write_version The new version for write. /// \param data_size The new data size of the object. /// \param metadata_size The new metadata size of the object. @@ -90,23 +98,28 @@ struct PlasmaObjectHeader { // Call after completing a write to signal that readers may read. // num_readers should be set before calling this. + /// + /// \param write_version The new version for write. This must match the + /// version previously passed to WriteAcquire. void WriteRelease(int64_t write_version); // Blocks until the given version or a more recent version is ready to read. + // If num_readers have already read this version, then this call will hang. // // \param read_version The minimum version to wait for. // \return The version that was read. This should be passed to ReadRelease - // when the reader is done. + // when the reader is done. Returns 0 if the object is a normal immutable + // object, meaning no ReadRelease is needed. + /// + /// \param read_version Read at least this version. int64_t ReadAcquire(int64_t read_version); - // Finishes the read. If all reads are done, signals to the - // writer. This is not necessary to call for objects that have - // num_readers=-1. + // Finishes the read. If all reads are done, signals to the writer. This is + // not necessary to call for objects that have num_readers=-1. 
+ /// + /// \param read_version This must match the version previously passed in + /// ReadAcquire. void ReadRelease(int64_t read_version); - - // Get the data size of the plasma object. - // The reader must first ReadAcquire. - uint64_t GetDataSize() const; }; /// A struct that includes info about the object. diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index d76f028c0d916..1741e72b1315d 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -95,14 +95,20 @@ struct ObjectInUseEntry { PlasmaObject object; /// A flag representing whether the object has been sealed. bool is_sealed; + + /// The below fields are experimental and used to implement + /// ray.experimental.channel. + /// + /// Whether the object is mutable. Most objects are immutable and cannot be + /// written to after the initial Create and Seal call. Mutable objects are + /// used to implement ray.experimental.channel. bool is_mutable = false; - /// For shared objects only. /// The last version that we read. To read again, we must pass a newer /// version than this. int64_t next_version_to_read = 1; /// Whether we currently have a read lock on the object. If this is true, /// then it is safe to read the value of the object. For immutable objects, - /// this will always be true once the object has been sealed. For immutable + /// this will always be true once the object has been sealed. For mutable /// objects, ReadRelease resets this to false, and ReadAcquire resets to /// true. bool read_acquired = false; @@ -150,12 +156,12 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data); + Status ExperimentalMutableObjectWriteAcquire(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data); Status Get(const std::vector &object_ids, int64_t timeout_ms, @@ -170,7 +176,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this &object_entry); - Status GetRelease(const ObjectID &object_id); + Status ExperimentalMutableObjectReadRelease(const ObjectID &object_id); Status Release(const ObjectID &object_id); @@ -226,9 +232,11 @@ class PlasmaClient::Impl : public std::enable_shared_from_this object, + bool is_sealed); + + void IncrementObjectCount(const ObjectID &object_id); /// The boost::asio IO context for the client. instrumented_io_context main_service_; @@ -306,31 +314,28 @@ bool PlasmaClient::Impl::IsInUse(const ObjectID &object_id) { return (elem != objects_in_use_.end()); } -void PlasmaClient::Impl::IncrementObjectCount(const ObjectID &object_id, - const PlasmaObject *object, - bool is_sealed) { +void PlasmaClient::Impl::InsertObjectInUse(const ObjectID &object_id, + std::unique_ptr object, + bool is_sealed) { + auto inserted = + objects_in_use_.insert({object_id, std::make_unique()}); + RAY_CHECK(inserted.second) << "Object already in use"; + auto it = inserted.first; + + // Add this object ID to the hash table of object IDs in use. The + // corresponding call to free happens in PlasmaClient::Release. + it->second->object = *object.release(); + // Count starts at 1 to pin the object. + it->second->count = 1; + it->second->is_sealed = is_sealed; +} + +void PlasmaClient::Impl::IncrementObjectCount(const ObjectID &object_id) { // Increment the count of the object to track the fact that it is being used. // The corresponding decrement should happen in PlasmaClient::Release. 
- auto elem = objects_in_use_.find(object_id); - ObjectInUseEntry *object_entry; - if (elem == objects_in_use_.end()) { - RAY_CHECK(object != nullptr); - // Add this object ID to the hash table of object IDs in use. The - // corresponding call to free happens in PlasmaClient::Release. - objects_in_use_[object_id] = std::make_unique(); - objects_in_use_[object_id]->object = *object; - objects_in_use_[object_id]->count = 0; - objects_in_use_[object_id]->is_sealed = is_sealed; - object_entry = objects_in_use_[object_id].get(); - } else { - object_entry = elem->second.get(); - // TODO(swang): Nicer way to pin shared objects. - // RAY_CHECK(object_entry->count > 0); - } - // Increment the count of the number of instances of this object that are - // being used by this client. The corresponding decrement should happen in - // PlasmaClient::Release. - object_entry->count += 1; + auto object_entry = objects_in_use_.find(object_id); + RAY_CHECK(object_entry != objects_in_use_.end()); + object_entry->second->count += 1; } Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, @@ -340,7 +345,7 @@ Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, std::vector buffer; RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaCreateReply, &buffer)); ObjectID id; - PlasmaObject object; + auto object = std::make_unique(); MEMFD_TYPE store_fd; int64_t mmap_size; @@ -349,7 +354,7 @@ Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, buffer.size(), &id, retry_with_request_id, - &object, + object.get(), &store_fd, &mmap_size)); if (*retry_with_request_id > 0) { @@ -359,51 +364,50 @@ Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, } else { uint64_t unused = 0; RAY_RETURN_NOT_OK(ReadCreateReply( - buffer.data(), buffer.size(), &id, &unused, &object, &store_fd, &mmap_size)); + buffer.data(), buffer.size(), &id, &unused, object.get(), &store_fd, &mmap_size)); RAY_CHECK(unused == 0); } // If the CreateReply included an error, then the store will not send a file // descriptor. - if (object.device_num == 0) { + if (object->device_num == 0) { // The metadata should come right after the data. - RAY_CHECK(object.metadata_offset == object.data_offset + object.data_size); + RAY_CHECK(object->metadata_offset == object->data_offset + object->data_size); RAY_LOG(DEBUG) << "GetStoreFdAndMmap " << store_fd.first << ", " << store_fd.second << ", size " << mmap_size << " for object id " << id; *data = std::make_shared( shared_from_this(), - GetStoreFdAndMmap(store_fd, mmap_size) + object.data_offset, - object.data_size); + GetStoreFdAndMmap(store_fd, mmap_size) + object->data_offset, + object->data_size); // If plasma_create is being called from a transfer, then we will not copy the // metadata here. The metadata will be written along with the data streamed // from the transfer. if (metadata != NULL) { // Copy the metadata to the buffer. - memcpy((*data)->Data() + object.data_size, metadata, object.metadata_size); + memcpy((*data)->Data() + object->data_size, metadata, object->metadata_size); } } else { RAY_LOG(FATAL) << "GPU is not enabled."; } - // Increment the count of the number of instances of this object that this - // client is using. A call to PlasmaClient::Release is required to decrement - // this count. Cache the reference to the object. - IncrementObjectCount(object_id, &object, false); - // TODO(swang): Remove the second increment call. + // Add the object as in use. 
A call to PlasmaClient::Release is required to + // decrement the initial ref count of 1. Cache the reference to the object. + InsertObjectInUse(object_id, std::move(object), /*is_sealed=*/false); // We increment the count a second time (and the corresponding decrement will // happen in a PlasmaClient::Release call in plasma_seal) so even if the // buffer returned by PlasmaClient::Create goes out of scope, the object does // not get released before the call to PlasmaClient::Seal happens. - IncrementObjectCount(object_id, &object, false); + IncrementObjectCount(object_id); return Status::OK(); } -Status PlasmaClient::Impl::WriteAcquireMutableObject(const ObjectID &object_id, - int64_t data_size, - const uint8_t *metadata, - int64_t metadata_size, - int64_t num_readers, - std::shared_ptr *data) { +Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( + const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data) { std::unique_lock guard(client_mutex_); auto object_entry = objects_in_use_.find(object_id); RAY_CHECK(object_entry != objects_in_use_.end()); @@ -590,7 +594,7 @@ Status PlasmaClient::Impl::GetBuffers( object_buffers[i].device_num = object->device_num; // Increment the count of the number of instances of this object that this // client is using. Cache the reference to the object. - IncrementObjectCount(object_ids[i], object, true); + IncrementObjectCount(object_ids[i]); } } @@ -606,7 +610,7 @@ Status PlasmaClient::Impl::GetBuffers( RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaGetReply, &buffer)); std::vector received_object_ids(num_objects); std::vector object_data(num_objects); - PlasmaObject *object; + auto object = std::make_unique(); std::vector store_fds; std::vector mmap_sizes; RAY_RETURN_NOT_OK(ReadGetReply(buffer.data(), @@ -629,7 +633,7 @@ Status PlasmaClient::Impl::GetBuffers( for (int64_t i = 0; i < num_objects; ++i) { RAY_DCHECK(received_object_ids[i] == object_ids[i]); - object = &object_data[i]; + *object = object_data[i]; if (object_buffers[i].data) { // If the object was already in use by the client, then the store should // have returned it. @@ -643,7 +647,7 @@ Status PlasmaClient::Impl::GetBuffers( if (object->data_size != -1) { // Increment the count of the number of instances of this object that this // client is using. Cache the reference to the object. - IncrementObjectCount(received_object_ids[i], object, true); + InsertObjectInUse(received_object_ids[i], std::move(object), /*is_sealed=*/true); auto &object_entry = objects_in_use_[received_object_ids[i]]; // Wait for the object to become ready to read. 
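After EnsureGetAcquired, the data and metadata buffers are sliced using the sizes recorded in the object-in-use entry (refreshed from the shared header), so a reader sees the size of the value most recently written rather than the allocated size. At the Python level this is what lets consecutive writes carry values of different types and sizes; a small sketch, assuming a single-node ray.init() (mirrors test_put_different_meta):

import numpy as np
import ray
import ray.experimental.channel as ray_channel

ray.init()

chan = ray_channel.Channel(1000)

# Each write may produce a different serialized data/metadata size; the
# reader picks the new sizes up from the shared header on begin_read().
for val in [b"hello", "hello", 1000, np.random.rand(10)]:
    chan.write(val, num_readers=1)
    read_val = chan.begin_read()
    if isinstance(val, np.ndarray):
        assert np.array_equal(read_val, val)
    else:
        assert read_val == val
    chan.end_read()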
@@ -651,22 +655,25 @@ Status PlasmaClient::Impl::GetBuffers( EnsureGetAcquired(object_entry); std::shared_ptr physical_buf; RAY_LOG(DEBUG) << "Plasma Get " << received_object_ids[i] - << ", data size: " << object->data_size - << ", metadata size: " << object->metadata_size; - if (object->device_num == 0) { - uint8_t *data = LookupMmappedFile(object->store_fd); + << ", data size: " << object_entry->object.data_size + << ", metadata size: " << object_entry->object.metadata_size; + if (object_entry->object.device_num == 0) { + uint8_t *data = LookupMmappedFile(object_entry->object.store_fd); physical_buf = std::make_shared( - data + object->data_offset, object->data_size + object->metadata_size); + data + object_entry->object.data_offset, + object_entry->object.data_size + object_entry->object.metadata_size); } else { RAY_LOG(FATAL) << "Arrow GPU library is not enabled."; } // Finish filling out the return values. physical_buf = wrap_buffer(object_ids[i], physical_buf); object_buffers[i].data = - SharedMemoryBuffer::Slice(physical_buf, 0, object->data_size); - object_buffers[i].metadata = SharedMemoryBuffer::Slice( - physical_buf, object->data_size, object->metadata_size); - object_buffers[i].device_num = object->device_num; + SharedMemoryBuffer::Slice(physical_buf, 0, object_entry->object.data_size); + object_buffers[i].metadata = + SharedMemoryBuffer::Slice(physical_buf, + object_entry->object.data_size, + object_entry->object.metadata_size); + object_buffers[i].device_num = object_entry->object.device_num; } else { // The object was not retrieved. The caller can detect this condition // by checking the boolean value of the metadata/data buffers. @@ -716,10 +723,10 @@ void PlasmaClient::Impl::EnsureGetAcquired( RAY_CHECK(object_entry->object.data_size + object_entry->object.metadata_size <= object_entry->object.allocated_size); } - return; } -Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { +Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( + const ObjectID &object_id) { RAY_LOG(DEBUG) << "Try to release Get for object " << object_id; std::unique_lock guard(client_mutex_); @@ -999,13 +1006,14 @@ Status PlasmaClient::Connect(const std::string &store_socket_name, store_socket_name, manager_socket_name, release_delay, num_retries); } -Status PlasmaClient::WriteAcquireMutableObject(const ObjectID &object_id, - int64_t data_size, - const uint8_t *metadata, - int64_t metadata_size, - int64_t num_readers, - std::shared_ptr *data) { - return impl_->WriteAcquireMutableObject( +Status PlasmaClient::ExperimentalMutableObjectWriteAcquire( + const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data) { + return impl_->ExperimentalMutableObjectWriteAcquire( object_id, data_size, metadata, metadata_size, num_readers, data); } @@ -1054,8 +1062,8 @@ Status PlasmaClient::Get(const std::vector &object_ids, return impl_->Get(object_ids, timeout_ms, object_buffers, is_from_worker); } -Status PlasmaClient::GetRelease(const ObjectID &object_id) { - return impl_->GetRelease(object_id); +Status PlasmaClient::ExperimentalMutableObjectReadRelease(const ObjectID &object_id) { + return impl_->ExperimentalMutableObjectReadRelease(object_id); } Status PlasmaClient::Release(const ObjectID &object_id) { diff --git a/src/ray/object_manager/plasma/client.h b/src/ray/object_manager/plasma/client.h index 00c85cca3f11e..e3f1aa1b05e3f 100644 --- a/src/ray/object_manager/plasma/client.h +++ 
b/src/ray/object_manager/plasma/client.h @@ -82,7 +82,34 @@ class PlasmaClientInterface { std::vector *object_buffers, bool is_from_worker) = 0; - virtual Status GetRelease(const ObjectID &object_id) = 0; + /// Experimental method for mutable objects. Acquires a write lock on the + /// object that prevents readers from reading until we are done writing. Does + /// not protect against concurrent writers. + /// + /// \param[in] object_id The ID of the object. + /// \param[in] data_size The size of the object to write. This overwrites the + /// current data size. + /// \param[in] metadata A pointer to the object metadata buffer to copy. This + /// will overwrite the current metadata. + /// \param[in] metadata_size The number of bytes to copy from the metadata + /// pointer. + /// \param[in] num_readers The number of readers that must read and release + /// the object before the caller can write again. + /// \param[out] data The mutable object buffer in plasma that can be written to. + virtual Status ExperimentalMutableObjectWriteAcquire(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data) = 0; + + /// Experimental method for mutable objects. Releases the objects, allowing them + /// to be written again. If the caller did not previously Get the objects, + /// then this first blocks until the latest value is available to read, then + /// releases the value. + /// + /// \param[in] object_id The ID of the object. + virtual Status ExperimentalMutableObjectReadRelease(const ObjectID &object_id) = 0; /// Seal an object in the object store. The object will be immutable after /// this @@ -137,13 +164,6 @@ class PlasmaClientInterface { plasma::flatbuf::ObjectSource source, int device_num = 0) = 0; - virtual Status WriteAcquireMutableObject(const ObjectID &object_id, - int64_t data_size, - const uint8_t *metadata, - int64_t metadata_size, - int64_t num_readers, - std::shared_ptr *data) = 0; - /// Delete a list of objects from the object store. This currently assumes that the /// object is present, has been sealed and not used by another client. Otherwise, /// it is a no operation. @@ -211,12 +231,12 @@ class PlasmaClient : public PlasmaClientInterface { plasma::flatbuf::ObjectSource source, int device_num = 0); - Status WriteAcquireMutableObject(const ObjectID &object_id, - int64_t data_size, - const uint8_t *metadata, - int64_t metadata_size, - int64_t num_readers, - std::shared_ptr *data); + Status ExperimentalMutableObjectWriteAcquire(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data); /// Create an object in the Plasma Store. Any metadata for this object must be /// be passed in when the object is created. @@ -273,7 +293,7 @@ class PlasmaClient : public PlasmaClientInterface { std::vector *object_buffers, bool is_from_worker); - Status GetRelease(const ObjectID &object_id); + Status ExperimentalMutableObjectReadRelease(const ObjectID &object_id); /// Tell Plasma that the client no longer needs the object. This should be /// called after Get() or Create() when the client is done with the object. 
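Taken together, this patch leaves the user-facing surface as ray.experimental.channel.Channel, backed by the ExperimentalMutableObjectWriteAcquire/ReadRelease plasma calls above. A minimal multi-reader sketch of that surface, assuming a single-node ray.init() so all readers share the writer's plasma store (the Reader actor and the counts are illustrative; compare test_put_remote_get):

import ray
import ray.experimental.channel as ray_channel

ray.init()

@ray.remote(num_cpus=0)
class Reader:
    def read(self, chan, num_writes):
        for i in range(num_writes):
            assert chan.begin_read() == i.to_bytes(8, "little")
            chan.end_read()

num_readers = 4
num_writes = 100
chan = ray_channel.Channel(1000)
readers = [Reader.remote() for _ in range(num_readers)]
done = [reader.read.remote(chan, num_writes) for reader in readers]

for i in range(num_writes):
    # The writer blocks here until all num_readers readers have released
    # the previous value.
    chan.write(i.to_bytes(8, "little"), num_readers=num_readers)
ray.get(done)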
From ea57894f405c757d8faa6ed6077aecfc7c9db0cf Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 1 Dec 2023 16:09:53 -0800 Subject: [PATCH 16/66] Test for errors, better error handling when too many readers Signed-off-by: Stephanie Wang --- python/ray/_private/ray_perf.py | 10 +++-- python/ray/experimental/channel.py | 29 ++++++++------ python/ray/tests/test_accelerated_dag.py | 49 ++++++++++++++++++++++++ src/ray/object_manager/common.cc | 44 ++++++++++++--------- src/ray/object_manager/common.h | 39 ++++++++++++------- src/ray/object_manager/plasma/client.cc | 37 ++++++++++++++---- 6 files changed, 153 insertions(+), 55 deletions(-) diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py index dcc49d42b2926..330527957d675 100644 --- a/python/ray/_private/ray_perf.py +++ b/python/ray/_private/ray_perf.py @@ -296,9 +296,9 @@ def async_actor_multi(): ray.init() - def put_channel_small(chans, num_readers=1, do_get=False, do_release=False): + def put_channel_small(chans, do_get=False, do_release=False): for chan in chans: - chan.write(b"0", num_readers=num_readers) + chan.write(b"0") if do_get: chan.begin_read() if do_release: @@ -337,14 +337,14 @@ def read(self, chans): n_cpu = multiprocessing.cpu_count() // 2 print(f"Testing multiple readers/channels, n={n_cpu}") - chans = [ray_channel.Channel(1000)] + chans = [ray_channel.Channel(1000, num_readers=n_cpu)] readers = [ChannelReader.remote() for _ in range(n_cpu)] ray.get([reader.ready.remote() for reader in readers]) for reader in readers: reader.read.remote(chans) results += timeit( "local put:n remote get, single channel calls", - lambda: put_channel_small(chans, num_readers=n_cpu), + lambda: put_channel_small(chans), ) for reader in readers: ray.kill(reader) @@ -369,6 +369,8 @@ def read(self, chans): for reader in readers: ray.kill(reader) + ray.shutdown() + ############################ # End of channel perf tests. ############################ diff --git a/python/ray/experimental/channel.py b/python/ray/experimental/channel.py index 42f82e4aa3398..e8ef9ad085f79 100644 --- a/python/ray/experimental/channel.py +++ b/python/ray/experimental/channel.py @@ -52,7 +52,7 @@ class Channel: ray.wait. """ - def __init__(self, buffer_size: Optional[int] = None): + def __init__(self, buffer_size: Optional[int] = None, num_readers: int = 1): """ Create a channel that can be read and written by co-located Ray processes. @@ -71,19 +71,20 @@ def __init__(self, buffer_size: Optional[int] = None): else: self._base_ref = _create_channel_ref(buffer_size) - self.worker = ray._private.worker.global_worker - self.worker.check_connected() + self._num_readers = num_readers + self._worker = ray._private.worker.global_worker + self._worker.check_connected() @staticmethod - def _from_base_ref(base_ref: "ray.ObjectRef") -> "Channel": - chan = Channel() + def _from_base_ref(base_ref: "ray.ObjectRef", num_readers: int) -> "Channel": + chan = Channel(num_readers=num_readers) chan._base_ref = base_ref return chan def __reduce__(self): - return self._from_base_ref, (self._base_ref,) + return self._from_base_ref, (self._base_ref, self._num_readers) - def write(self, value: Any, num_readers: int): + def write(self, value: Any, num_readers: Optional[int] = None): """ Write a value to the channel. @@ -96,11 +97,13 @@ def write(self, value: Any, num_readers: int): num_readers: The number of readers that must read and release the value before we can write again. 
""" + if num_readers is None: + num_readers = self._num_readers if num_readers <= 0: raise ValueError("``num_readers`` must be a positive integer.") try: - serialized_value = self.worker.get_serialization_context().serialize(value) + serialized_value = self._worker.get_serialization_context().serialize(value) except TypeError as e: sio = io.StringIO() ray.util.inspect_serializability(value, print_file=sio) @@ -111,7 +114,7 @@ def write(self, value: Any, num_readers: int): ) raise TypeError(msg) from e - self.worker.core_worker.experimental_mutable_object_put_serialized( + self._worker.core_worker.experimental_mutable_object_put_serialized( serialized_value, self._base_ref, num_readers, @@ -122,10 +125,14 @@ def begin_read(self) -> Any: Read the latest value from the channel. This call will block until a value is available to read. + Subsequent calls to begin_read() will return the same value, until + end_read() is called. Then, the client must begin_read() again to get + the next value. + Returns: Any: The deserialized value. """ - values, _ = self.worker.get_objects( + values, _ = self._worker.get_objects( [self._base_ref], _is_experimental_mutable_object=True ) return values[0] @@ -137,6 +144,6 @@ def end_read(self): If begin_read is not called first, then this call will block until a value is written, then drop the value. """ - self.worker.core_worker.experimental_mutable_object_read_release( + self._worker.core_worker.experimental_mutable_object_read_release( [self._base_ref] ) diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index 3795baaddf9fc..20e88a101efaf 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -21,9 +21,57 @@ def test_put_local_get(ray_start_regular): val = i.to_bytes(8, "little") chan.write(val, num_readers=1) assert chan.begin_read() == val + + # Begin read multiple times will return the same value. + assert chan.begin_read() == val + chan.end_read() +def test_errors(ray_start_regular): + @ray.remote + class Actor: + def make_chan(self, do_write=True): + self.chan = ray_channel.Channel(1000) + if do_write: + self.chan.write(b"hello", num_readers=1) + return self.chan + + a = Actor.remote() + # Only original creator can write. + chan = ray.get(a.make_chan.remote(do_write=False)) + with pytest.raises(ray.exceptions.RaySystemError): + chan.write(b"hi") + + # Only original creator can write. + chan = ray.get(a.make_chan.remote(do_write=True)) + assert chan.begin_read() == b"hello" + with pytest.raises(ray.exceptions.RaySystemError): + chan.write(b"hi") + + # Multiple consecutive reads from the same process are fine. + chan = ray.get(a.make_chan.remote(do_write=True)) + assert chan.begin_read() == b"hello" + assert chan.begin_read() == b"hello" + chan.end_read() + + @ray.remote + class Reader: + def __init__(self): + pass + + def read(self, chan): + return chan.begin_read() + + # Multiple reads from n different processes, where n > num_readers, errors. 
+ chan = ray.get(a.make_chan.remote(do_write=True)) + readers = [Reader.remote(), Reader.remote()] + # At least 1 reader + with pytest.raises(ray.exceptions.RayTaskError) as exc_info: + ray.get([reader.read.remote(chan) for reader in readers]) + assert "ray.exceptions.RaySystemError" in str(exc_info.value) + + def test_put_different_meta(ray_start_regular): chan = ray_channel.Channel(1000) @@ -42,6 +90,7 @@ def _test(val): _test(1000) _test(np.random.rand(10)) + # Cannot put a serialized value larger than the allocated buffer. with pytest.raises(ValueError): _test(np.random.rand(100)) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index a362f784e41f7..35a21ce0e4654 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -56,6 +56,7 @@ void PlasmaObjectHeader::WriteAcquire(int64_t write_version, << ". Are you sure this is the only writer?"; version = write_version; + is_sealed = false; data_size = write_data_size; metadata_size = write_metadata_size; num_readers = write_num_readers; @@ -76,6 +77,7 @@ void PlasmaObjectHeader::WriteRelease(int64_t write_version) { << version << ". Are you sure this is the only writer?"; version = write_version; + is_sealed = true; RAY_CHECK(num_readers != 0) << num_readers; num_read_acquires_remaining = num_readers; num_read_releases_remaining = num_readers; @@ -87,30 +89,36 @@ void PlasmaObjectHeader::WriteRelease(int64_t write_version) { RAY_CHECK(pthread_cond_broadcast(&cond) == 0); } -int64_t PlasmaObjectHeader::ReadAcquire(int64_t read_version) { - RAY_LOG(DEBUG) << "ReadAcquire waiting version " << read_version; +bool PlasmaObjectHeader::ReadAcquire(int64_t version_to_read, int64_t *version_read) { + RAY_LOG(DEBUG) << "ReadAcquire waiting version " << version_to_read; RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); - RAY_LOG(DEBUG) << "ReadAcquire " << read_version; + RAY_LOG(DEBUG) << "ReadAcquire " << version_to_read; PrintPlasmaObjectHeader(this); - while (version < read_version || num_read_acquires_remaining == 0) { + // Wait for the requested version (or a more recent one) to be sealed. + while (version < version_to_read || !is_sealed) { RAY_CHECK(pthread_cond_wait(&cond, &wr_mut) == 0); } - if (version > read_version) { - RAY_LOG(WARNING) << "Version " << version << " already exceeds version to read " - << read_version << ". May have missed earlier reads."; - } - - if (num_readers != -1) { - num_read_acquires_remaining--; - RAY_CHECK(num_read_acquires_remaining >= 0) - << "readers acquired exceeds max readers " << num_readers; - // This object can only be read a constant number of times. Tell the caller - // which version was read. - read_version = version; + bool success = false; + if (num_readers == -1) { + // Object is a normal immutable object. Read succeeds. + *version_read = 0; + success = true; } else { - read_version = 0; + *version_read = version; + if (version == version_to_read && num_read_acquires_remaining > 0) { + // This object is at the right version and still has reads remaining. Read + // succeeds. 
+ num_read_acquires_remaining--; + success = true; + } else if (version > version_to_read) { + RAY_LOG(WARNING) << "Version " << version << " already exceeds version to read " + << version_to_read; + } else { + RAY_LOG(WARNING) << "Version " << version << " already has " << num_readers + << "readers"; + } } RAY_LOG(DEBUG) << "ReadAcquire done"; @@ -119,7 +127,7 @@ int64_t PlasmaObjectHeader::ReadAcquire(int64_t read_version) { RAY_CHECK(pthread_mutex_unlock(&wr_mut) == 0); // Signal to other readers that they may read. RAY_CHECK(pthread_cond_signal(&cond) == 0); - return read_version; + return success; } void PlasmaObjectHeader::ReadRelease(int64_t read_version) { diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 6d9a8655dd2c9..eea4ccd8eb7ba 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -57,6 +57,12 @@ struct PlasmaObjectHeader { // the first write and then should never be modified. For mutable objects, // each new write must increment the version before releasing to readers. int64_t version = 0; + // Indicates whether the current version has been written. is_sealed=false + // means that there is a writer who has WriteAcquire'd but not yet + // WriteRelease'd the current version. is_sealed=true means that `version` + // has been WriteRelease'd. A reader may read the actual object value if + // is_sealed=true and num_read_acquires_remaining != 0. + bool is_sealed = false; // The total number of reads allowed before the writer can write again. This // value should be set by the writer before releasing to readers. // For immutable objects, this is set to -1 and infinite reads are allowed. @@ -66,6 +72,10 @@ struct PlasmaObjectHeader { // objects, readers must ensure this is > 0 and decrement before they read. // Once this value reaches 0, no more readers are allowed until the writer // writes a new version. + // NOTE(swang): Technically we do not need this because + // num_read_releases_remaining protects against too many readers. However, + // this allows us to throw an error as soon as the n+1-th reader begins, + // instead of waiting to error until the n+1-th reader is done reading. int64_t num_read_acquires_remaining = 0; // The number of readers who must release the current version before a new // version can be written. For mutable objects, readers must decrement this @@ -79,13 +89,15 @@ struct PlasmaObjectHeader { uint64_t data_size = 0; uint64_t metadata_size = 0; + /// Setup synchronization primitives. void Init(); + /// Destroy synchronization primitives. void Destroy(); - /// Blocks until all readers for the previous write have ReadRelease'd the value. - /// Caller must ensure there is one writer at a time. Caller must pass - /// consecutive versions on each new write, starting with write_version=1. + /// Blocks until all readers for the previous write have ReadRelease'd the + /// value. Protects against concurrent writers. Caller must pass consecutive + /// versions on each new write, starting with write_version=1. /// /// \param write_version The new version for write. /// \param data_size The new data size of the object. @@ -96,23 +108,22 @@ struct PlasmaObjectHeader { uint64_t metadata_size, int64_t num_readers); - // Call after completing a write to signal that readers may read. - // num_readers should be set before calling this. + /// Call after completing a write to signal that readers may read. + /// num_readers should be set before calling this. 
/// /// \param write_version The new version for write. This must match the /// version previously passed to WriteAcquire. void WriteRelease(int64_t write_version); - // Blocks until the given version or a more recent version is ready to read. - // If num_readers have already read this version, then this call will hang. + // Blocks until the given version is ready to read. Returns false if the + // maximum number of readers have already read the requested version. // - // \param read_version The minimum version to wait for. - // \return The version that was read. This should be passed to ReadRelease - // when the reader is done. Returns 0 if the object is a normal immutable - // object, meaning no ReadRelease is needed. - /// - /// \param read_version Read at least this version. - int64_t ReadAcquire(int64_t read_version); + // \param[in] read_version The version to read. + // \param[out] version_read For normal immutable objects, this will be set to + // 0. Otherwise, the current version. + // \return success Whether the correct version was read and there were still + // reads remaining. + bool ReadAcquire(int64_t version_to_read, int64_t *version_read); // Finishes the read. If all reads are done, signals to the writer. This is // not necessary to call for objects that have num_readers=-1. diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 1741e72b1315d..f566392368ab0 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -103,6 +103,9 @@ struct ObjectInUseEntry { /// written to after the initial Create and Seal call. Mutable objects are /// used to implement ray.experimental.channel. bool is_mutable = false; + /// Whether we are the writer. For now, only the original creator of the + /// mutable object may write to it. + bool is_writer = false; /// The last version that we read. To read again, we must pass a newer /// version than this. int64_t next_version_to_read = 1; @@ -174,7 +177,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this &object_entry); + Status EnsureGetAcquired(std::unique_ptr &object_entry); Status ExperimentalMutableObjectReadRelease(const ObjectID &object_id); @@ -410,6 +413,14 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( std::shared_ptr *data) { std::unique_lock guard(client_mutex_); auto object_entry = objects_in_use_.find(object_id); + if (object_entry == objects_in_use_.end()) { + return Status::Invalid( + "Plasma buffer for mutable object not in scope. Are you sure you're the writer?"); + } + if (!object_entry->second->is_writer) { + return Status::Invalid( + "Mutable objects can only be written by the original creator process."); + } RAY_CHECK(object_entry != objects_in_use_.end()); auto &entry = object_entry->second; @@ -493,7 +504,9 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, entry->is_mutable = is_mutable; auto plasma_header = GetPlasmaObjectHeader(entry->object); - if (!entry->is_mutable) { + if (entry->is_mutable) { + entry->is_writer = true; + } else { // The first creation's version is always 1. RAY_CHECK(entry->next_version_to_write == 1); // The corresponding WriteRelease takes place in Seal. @@ -571,7 +584,7 @@ Status PlasmaClient::Impl::GetBuffers( all_present = false; } else { // Wait for the object to become ready to read. 
- EnsureGetAcquired(object_entry->second); + RAY_RETURN_NOT_OK(EnsureGetAcquired(object_entry->second)); PlasmaObject *object = &object_entry->second->object; @@ -652,7 +665,7 @@ Status PlasmaClient::Impl::GetBuffers( // Wait for the object to become ready to read. RAY_CHECK(!object_entry->read_acquired); - EnsureGetAcquired(object_entry); + RAY_RETURN_NOT_OK(EnsureGetAcquired(object_entry)); std::shared_ptr physical_buf; RAY_LOG(DEBUG) << "Plasma Get " << received_object_ids[i] << ", data size: " << object_entry->object.data_size @@ -700,15 +713,22 @@ Status PlasmaClient::Impl::Get(const std::vector &object_ids, &object_ids[0], num_objects, timeout_ms, wrap_buffer, &(*out)[0], is_from_worker); } -void PlasmaClient::Impl::EnsureGetAcquired( +Status PlasmaClient::Impl::EnsureGetAcquired( std::unique_ptr &object_entry) { PlasmaObject *object = &object_entry->object; auto plasma_header = GetPlasmaObjectHeader(*object); if (object_entry->read_acquired) { - return; + return Status::OK(); + } + + int64_t version_read = 0; + bool success = + plasma_header->ReadAcquire(object_entry->next_version_to_read, &version_read); + if (!success) { + return Status::Invalid( + "Reader missed a value. Are you sure there are num_readers many readers?"); } - int64_t version_read = plasma_header->ReadAcquire(object_entry->next_version_to_read); object_entry->read_acquired = true; if (version_read > 0) { object_entry->is_mutable = true; @@ -723,6 +743,7 @@ void PlasmaClient::Impl::EnsureGetAcquired( RAY_CHECK(object_entry->object.data_size + object_entry->object.metadata_size <= object_entry->object.allocated_size); } + return Status::OK(); } Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( @@ -745,7 +766,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( "ray.release() called on an object that is not mutable"); } - EnsureGetAcquired(entry); + RAY_RETURN_NOT_OK(EnsureGetAcquired(entry)); RAY_LOG(DEBUG) << "Release shared object " << object_id; auto plasma_header = GetPlasmaObjectHeader(entry->object); plasma_header->ReadRelease(entry->next_version_to_read); From 5bbf37947291473714e92b90099a60dd4a925848 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 1 Dec 2023 16:11:01 -0800 Subject: [PATCH 17/66] remove unneeded Signed-off-by: Stephanie Wang --- BUILD.bazel | 3 --- 1 file changed, 3 deletions(-) diff --git a/BUILD.bazel b/BUILD.bazel index 1f8ff15b53798..42cc658621795 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -402,9 +402,6 @@ ray_cc_library( ":plasma_client", "//src/ray/common:network", ":stats_lib", - "@boost//:asio", - "@boost//:context", - "@boost//:coroutine", ], ) From 1e16e09cdeea87baaf0daacee515d4e58d74bb29 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 1 Dec 2023 17:08:39 -0800 Subject: [PATCH 18/66] java build Signed-off-by: Stephanie Wang --- .../lib/java/io_ray_runtime_object_NativeObjectStore.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.cc b/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.cc index 955b46f746e96..72027ff27af52 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.cc +++ b/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.cc @@ -42,6 +42,7 @@ Status PutSerializedObject(JNIEnv *env, nested_ids.push_back(ObjectID::FromBinary(ref.object_id())); } status = CoreWorkerProcess::GetCoreWorker().CreateOwnedAndIncrementLocalRef( + /*is_experimental_mutable_object=*/false, 
native_ray_object->GetMetadata(), data_size, nested_ids, @@ -128,7 +129,10 @@ JNIEXPORT jobject JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeGet }); std::vector> results; auto status = - CoreWorkerProcess::GetCoreWorker().Get(object_ids, (int64_t)timeoutMs, &results); + CoreWorkerProcess::GetCoreWorker().Get(object_ids, + (int64_t)timeoutMs, + /*is_experimental_mutable_object=*/false, + &results); THROW_EXCEPTION_AND_RETURN_IF_NOT_OK(env, status, nullptr); return NativeVectorToJavaList>( env, results, NativeRayObjectToJavaNativeRayObject); From 580b3ad3d02ac7ca2c7f4efa42e15e7e6842b12c Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 1 Dec 2023 17:14:30 -0800 Subject: [PATCH 19/66] rename Signed-off-by: Stephanie Wang --- python/ray/tests/{test_accelerated_dag.py => test_channel.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python/ray/tests/{test_accelerated_dag.py => test_channel.py} (100%) diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_channel.py similarity index 100% rename from python/ray/tests/test_accelerated_dag.py rename to python/ray/tests/test_channel.py From bdfbb8afe81d8e0ea9f03f0a18cc229452507fef Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 1 Dec 2023 21:45:49 -0800 Subject: [PATCH 20/66] tmp Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 293 ++++++++++++++++++++++++++++ python/ray/dag/dag_node.py | 48 ++++- 2 files changed, 335 insertions(+), 6 deletions(-) create mode 100644 python/ray/dag/compiled_dag_node.py diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py new file mode 100644 index 0000000000000..88b46e3ef16ba --- /dev/null +++ b/python/ray/dag/compiled_dag_node.py @@ -0,0 +1,293 @@ +import time +import threading +from typing import List +from collections import defaultdict + +import ray +from ray.exceptions import RayTaskError, TaskCancelledError + + +MAX_BUFFER_SIZE = int(100 * 1e6) # 100MB + + +def allocate_shared_output_buffer(buffer_size_bytes: int = MAX_BUFFER_SIZE): + assert isinstance(MAX_BUFFER_SIZE, int) + ref = ray.put(b"0" * buffer_size_bytes, max_readers=1) + # TODO(swang): Sleep to make sure that the object store sees the Seal. Should + # replace this with a better call to put reusable objects, and have the object + # store ReadRelease. 
+ time.sleep(1) + ray.release(ref) + return ref + + +def do_allocate_shared_output_buffer(self, buffer_size_bytes: int = MAX_BUFFER_SIZE): + self._output_ref = allocate_shared_output_buffer(buffer_size_bytes) + return self._output_ref + + +def do_exec_compiled_task( + self, + input_refs: List[ray.ObjectRef], + actor_method_name: str, + output_max_readers: int, +): + try: + self._input_refs = input_refs + method = getattr(self, actor_method_name) + while True: + inputs = ray.get(input_refs) + output_val = method(*inputs) + ray.worker.global_worker.put_object( + output_val, + object_ref=self._output_ref, + max_readers=output_max_readers, + ) + for input_ref in input_refs: + ray.release(input_ref) + except Exception as e: + print("Task aborted", e) + raise + + +def do_cancel_compiled_task(self): + input_refs = self._input_refs + e = RayTaskError( + function_name="do_exec_compiled_task", + traceback_str="", + cause=TaskCancelledError()) + for input_ref in self._input_refs: + print("Putting cancellation token", input_ref) + try: + ray.worker.global_worker.put_object( + e, + object_ref=input_ref, + max_readers=1, + try_wait=True, + ) + except Exception as e: + if "write acquire failed" in str(e): + pass + else: + raise + + +class CompiledTask: + """Wraps the normal Ray DAGNode with some metadata.""" + + def __init__(self, idx, dag_node: "DAGNode"): + self.idx = idx + self.dag_node = dag_node + + self.args = [] + self.dependent_node_idxs = [] + self.output_ref = None + + @property + def max_readers(self): + return len(self.dependent_node_idxs) + + def __str__(self): + return f""" +Node: {self.dag_node} +Arguments: {self.args} +Output: {self.output_ref} +""" + + +class CompiledDAG: + def __init__(self): + # idx -> CompiledTask. + self.idx_to_task = {} + # DAGNode -> idx. + self.dag_node_to_idx = {} + # idx counter. + self.counter = 0 + + self.input_task_idx = None + self.output_task_idx = None + self.node_idx_to_output_refs = {} + + # Cached. + self.dag_input_ref = None + self.dag_input_max_readers = None + self.dag_output_refs = None + self.worker_task_refs = [] + self.actor_refs = set() + + def add_node(self, node): + idx = self.counter + self.idx_to_task[idx] = CompiledTask(idx, node) + self.dag_node_to_idx[node] = idx + self.counter += 1 + + def preprocess(self): + from ray.dag import DAGNode, InputNode, OutputNode + + for idx, task in self.idx_to_task.items(): + task.args = task.dag_node.get_args() + for arg in task.args: + if isinstance(arg, DAGNode): + arg_idx = self.dag_node_to_idx[arg] + self.idx_to_task[arg_idx].dependent_node_idxs.append(idx) + if isinstance(task.dag_node, InputNode): + assert self.input_task_idx is None, "more than one InputNode found" + self.input_task_idx = idx + # TODO: Support no-input DAGs (use an empty object to signal). 
+ assert ( + self.input_task_idx is not None + ), "no InputNode found, require exactly one" + + for idx, task in self.idx_to_task.items(): + if len(task.dependent_node_idxs) == 0: + assert ( + self.output_task_idx is None + ), "More than one output node found, make sure only one node has 0 dependent tasks" + self.output_task_idx = idx + + def compiled(self): + from ray.dag import DAGNode, InputNode, OutputNode, ClassMethodNode + + if self.dag_input_ref is not None and self.dag_output_refs is not None: + # Driver should ray.put on input, ray.get/release on output + return ( + self.dag_input_ref, + self.dag_input_max_readers, + self.dag_output_refs, + self.monitor, + ) + + queue = [self.input_task_idx] + visited = set() + # Create output buffers + while queue: + cur_idx = queue.pop(0) + if cur_idx in visited: + continue + visited.add(cur_idx) + + task = self.idx_to_task[cur_idx] + dependent_node_idxs = task.dependent_node_idxs + + # Create an output buffer on the actor. + assert task.output_ref is None + if isinstance(task.dag_node, ClassMethodNode): + fn = task.dag_node._get_remote_method("__ray_apply__") + task.output_ref = ray.get(fn.remote(do_allocate_shared_output_buffer)) + self.actor_refs.add(task.dag_node._get_actor()) + elif isinstance(task.dag_node, InputNode): + task.output_ref = allocate_shared_output_buffer() + else: + assert isinstance(task.dag_node, OutputNode) + + for idx in task.dependent_node_idxs: + queue.append(idx) + + output_node = self.idx_to_task[self.output_task_idx].dag_node + # TODO: Add an OutputNode to the end of the DAG if + # it's not already there. + assert isinstance(output_node, OutputNode) + + work_refs = [] + for node_idx, task in self.idx_to_task.items(): + if node_idx == self.input_task_idx: + # We don't need to assign an actual task for the input node. + continue + + if node_idx == self.output_task_idx: + # We don't need to assign an actual task for the input node. + continue + + resolved_args = [] + for arg in task.args: + # TODO(swang): Support non-ObjectRef args. + assert isinstance(arg, DAGNode) + arg_idx = self.dag_node_to_idx[arg] + arg_buffer = self.idx_to_task[arg_idx].output_ref + assert arg_buffer is not None + resolved_args.append(arg_buffer) + + # TODO: Assign the task with the correct input and output buffers. 
+ worker_fn = task.dag_node._get_remote_method("__ray_apply__") + self.worker_task_refs.append( + worker_fn.options(concurrency_group="_ray_system").remote( + do_exec_compiled_task, + resolved_args, + task.dag_node.get_method_name(), + task.max_readers, + ) + ) + + self.dag_input_ref = self.idx_to_task[self.input_task_idx].output_ref + self.dag_input_max_readers = self.idx_to_task[self.input_task_idx].max_readers + + self.dag_output_refs = [] + for output in self.idx_to_task[self.output_task_idx].args: + assert isinstance(output, DAGNode) + output_idx = self.dag_node_to_idx[output] + self.dag_output_refs.append(self.idx_to_task[output_idx].output_ref) + + assert self.dag_input_ref + assert self.dag_output_refs + # Driver should ray.put on input, ray.get/release on output + self.monitor = self.monitor_failures() + return (self.dag_input_ref, self.dag_input_max_readers, self.dag_output_refs, self.monitor) + + def monitor_failures(self): + outer = self + + class Monitor(threading.Thread): + def __init__(self): + super().__init__(daemon=True) + self.in_destroy = False + + def destroy(self): + if self.in_destroy: + return + self.in_destroy = True + for actor in outer.actor_refs: + print("Cancelling compiled worker on actor", actor) + try: + ray.get(actor.__ray_apply__.remote(do_cancel_compiled_task)) + except Exception as e: + print("Error cancelling", e) + pass + + def run(self): + try: + ray.get(outer.worker_task_refs) + except Exception as e: + if self.in_destroy: + return + print("Worker task exception", e) + for output_ref in outer.dag_output_refs: + print("Putting error", output_ref) + try: + ray.worker.global_worker.put_object( + e, + object_ref=output_ref, + max_readers=1, + try_wait=True, + ) + except Exception as f: + if "write acquire failed" in str(f): + pass + else: + raise + self.destroy() + + monitor = Monitor() + monitor.start() + return monitor + + +def build_compiled_dag(dag: "DAGNode"): + compiled_dag = CompiledDAG() + + def build_compiled_dag(node): + compiled_dag.add_node(node) + return node + + dag.apply_recursive(build_compiled_dag) + compiled_dag.preprocess() + return compiled_dag diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 6041a12401855..5a1f2251438ac 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -16,6 +16,8 @@ import uuid import asyncio +from ray.dag.compiled_dag_node import build_compiled_dag + T = TypeVar("T") @@ -59,6 +61,8 @@ def __init__( # Cached values from last call to execute() self.cache_from_last_execute = {} + self._compiled_dag = None + def get_args(self) -> Tuple[Any]: """Return the tuple of arguments for this node.""" @@ -103,8 +107,19 @@ async def get_object_refs_from_last_execute(self) -> Dict[str, Any]: def clear_cache(self): self.cache_from_last_execute = {} + def compiled(self) -> Tuple[ray.ObjectRef]: + if self._compiled_dag is None: + self._compiled_dag = build_compiled_dag(self) + + return self._compiled_dag.compiled() + def execute( - self, *args, _ray_cache_refs: bool = False, **kwargs + self, + *args, + _ray_cache_refs: bool = False, + _ray_cache_actors: bool = True, + compiled: bool = False, + **kwargs, ) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: """Execute this DAG using the Ray default executor _execute_impl(). 
@@ -115,15 +130,33 @@ def execute( - Serve handles for class nodes - resolved values representing user input at runtime """ + if compiled: + assert len(args) == 1, "Compiled DAGs support exactly one InputNode arg" + input_ref, input_max_readers, output_ref, _ = self.compiled() + ray.worker.global_worker.put_object( + args[0], object_ref=input_ref, max_readers=input_max_readers + ) + return output_ref def executor(node): return node._execute_impl(*args, **kwargs) - result = self.apply_recursive(executor) + cache = {} if _ray_cache_refs: + cache = self.cache_from_last_execute + elif _ray_cache_actors: + for key, ref in self.cache_from_last_execute.items(): + if isinstance(ref, ray.actor.ActorHandle): + cache[key] = ref + result = self.apply_recursive(executor, cache=cache) + if _ray_cache_refs or _ray_cache_actors: self.cache_from_last_execute = executor.cache return result + def destroy_compiled_dag(self): + _, _, _, monitor = self.compiled() + monitor.destroy() + def _get_toplevel_child_nodes(self) -> List["DAGNode"]: """Return the list of nodes specified as top-level args. @@ -218,7 +251,7 @@ def _apply_and_replace_all_child_nodes( new_args, new_kwargs, self.get_options(), new_other_args_to_resolve ) - def apply_recursive(self, fn: "Callable[[DAGNode], T]") -> T: + def apply_recursive(self, fn: "Callable[[DAGNode], T]", cache=None) -> T: """Apply callable on each node in this DAG in a bottom-up tree walk. Args: @@ -231,8 +264,11 @@ def apply_recursive(self, fn: "Callable[[DAGNode], T]") -> T: """ class _CachingFn: - def __init__(self, fn): - self.cache = {} + def __init__(self, fn, cache=None): + if cache is None: + self.cache = {} + else: + self.cache = cache self.fn = fn self.fn.cache = self.cache self.input_node_uuid = None @@ -250,7 +286,7 @@ def __call__(self, node): return self.cache[node._stable_uuid] if not type(fn).__name__ == "_CachingFn": - fn = _CachingFn(fn) + fn = _CachingFn(fn, cache) return fn( self._apply_and_replace_all_child_nodes( From fe11cc36216fe37327122fb4390bdf5489206a26 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 1 Dec 2023 21:50:00 -0800 Subject: [PATCH 21/66] test metadata change in remote reader Signed-off-by: Stephanie Wang --- python/ray/tests/test_channel.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/python/ray/tests/test_channel.py b/python/ray/tests/test_channel.py index 20e88a101efaf..0bd008593a110 100644 --- a/python/ray/tests/test_channel.py +++ b/python/ray/tests/test_channel.py @@ -112,6 +112,19 @@ def read(self, chan, num_writes): assert chan.begin_read() == val chan.end_read() + for i in range(num_writes): + val = i.to_bytes(100, "little") + assert chan.begin_read() == val + chan.end_read() + + for val in [ + b"hello world", + "hello again", + 1000, + ]: + assert chan.begin_read() == val + chan.end_read() + num_writes = 1000 readers = [Reader.remote() for _ in range(num_readers)] done = [reader.read.remote(chan, num_writes) for reader in readers] @@ -119,6 +132,19 @@ def read(self, chan, num_writes): val = i.to_bytes(8, "little") chan.write(val, num_readers=num_readers) + # Test different data size. + for i in range(num_writes): + val = i.to_bytes(100, "little") + chan.write(val, num_readers=num_readers) + + # Test different metadata. 
+ for val in [ + b"hello world", + "hello again", + 1000, + ]: + chan.write(val, num_readers=num_readers) + ray.get(done) From e11b6146849f68f01b1f9e84121ae5647599d67f Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Mon, 4 Dec 2023 13:55:32 -0800 Subject: [PATCH 22/66] build Signed-off-by: Stephanie Wang --- cpp/src/ray/runtime/object/native_object_store.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/ray/runtime/object/native_object_store.cc b/cpp/src/ray/runtime/object/native_object_store.cc index b65159a9c83fd..6d1a14ae120f9 100644 --- a/cpp/src/ray/runtime/object/native_object_store.cc +++ b/cpp/src/ray/runtime/object/native_object_store.cc @@ -91,7 +91,8 @@ std::vector> NativeObjectStore::GetRaw( const std::vector &ids, int timeout_ms) { auto &core_worker = CoreWorkerProcess::GetCoreWorker(); std::vector> results; - ::ray::Status status = core_worker.Get(ids, timeout_ms, &results); + ::ray::Status status = core_worker.Get( + ids, timeout_ms, /*is_experimental_mutable_object=*/false, &results); if (!status.ok()) { if (status.IsTimedOut()) { throw RayTimeoutException("Get object error:" + status.message()); From b6150a3ebcafed9995b24bd78974842e5badfffe Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Mon, 4 Dec 2023 22:22:33 -0800 Subject: [PATCH 23/66] scatter-gather DAG works Signed-off-by: Stephanie Wang --- python/ray/dag/class_node.py | 4 + python/ray/dag/compiled_dag_node.py | 192 ++++++++-------------------- python/ray/dag/dag_node.py | 8 +- 3 files changed, 63 insertions(+), 141 deletions(-) diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index a474ffa10c553..8daf406dd2e1c 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -193,3 +193,7 @@ def __str__(self) -> str: def get_method_name(self) -> str: return self._method_name + + def _get_remote_method(self, method_name): + method_body = getattr(self._parent_class_node, method_name) + return method_body diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 88b46e3ef16ba..74ce83c91a917 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -1,97 +1,69 @@ -import time -import threading from typing import List -from collections import defaultdict import ray -from ray.exceptions import RayTaskError, TaskCancelledError +import ray.experimental.channel as ray_channel MAX_BUFFER_SIZE = int(100 * 1e6) # 100MB -def allocate_shared_output_buffer(buffer_size_bytes: int = MAX_BUFFER_SIZE): - assert isinstance(MAX_BUFFER_SIZE, int) - ref = ray.put(b"0" * buffer_size_bytes, max_readers=1) - # TODO(swang): Sleep to make sure that the object store sees the Seal. Should - # replace this with a better call to put reusable objects, and have the object - # store ReadRelease. 
- time.sleep(1) - ray.release(ref) - return ref +def allocate_channel(buffer_size_bytes: int = MAX_BUFFER_SIZE, num_readers: int = 1): + if not isinstance(buffer_size_bytes, int): + raise ValueError("buffer_size_bytes must be an integer") + if not isinstance(num_readers, int): + raise ValueError("num_readers must be an integer") + return ray_channel.Channel(buffer_size_bytes, num_readers) -def do_allocate_shared_output_buffer(self, buffer_size_bytes: int = MAX_BUFFER_SIZE): - self._output_ref = allocate_shared_output_buffer(buffer_size_bytes) - return self._output_ref + +def do_allocate_channel( + self, buffer_size_bytes: int = MAX_BUFFER_SIZE, num_readers: int = 1 +): + self._output_channel = allocate_channel(buffer_size_bytes) + return self._output_channel def do_exec_compiled_task( self, - input_refs: List[ray.ObjectRef], + input_channels: List["ray_channel.Channel"], actor_method_name: str, - output_max_readers: int, ): try: - self._input_refs = input_refs + self._input_channels = input_channels method = getattr(self, actor_method_name) while True: - inputs = ray.get(input_refs) + inputs = [chan.begin_read() for chan in input_channels] output_val = method(*inputs) - ray.worker.global_worker.put_object( - output_val, - object_ref=self._output_ref, - max_readers=output_max_readers, - ) - for input_ref in input_refs: - ray.release(input_ref) + + self._output_channel.write(output_val) + for chan in input_channels: + chan.end_read() + except Exception as e: print("Task aborted", e) raise -def do_cancel_compiled_task(self): - input_refs = self._input_refs - e = RayTaskError( - function_name="do_exec_compiled_task", - traceback_str="", - cause=TaskCancelledError()) - for input_ref in self._input_refs: - print("Putting cancellation token", input_ref) - try: - ray.worker.global_worker.put_object( - e, - object_ref=input_ref, - max_readers=1, - try_wait=True, - ) - except Exception as e: - if "write acquire failed" in str(e): - pass - else: - raise - - class CompiledTask: """Wraps the normal Ray DAGNode with some metadata.""" - def __init__(self, idx, dag_node: "DAGNode"): + def __init__(self, idx, dag_node: "ray.dag.DAGNode"): self.idx = idx self.dag_node = dag_node self.args = [] self.dependent_node_idxs = [] - self.output_ref = None + self.output_channel = None @property - def max_readers(self): + def num_readers(self): return len(self.dependent_node_idxs) def __str__(self): return f""" Node: {self.dag_node} Arguments: {self.args} -Output: {self.output_ref} +Output: {self.output_channel} """ @@ -106,14 +78,12 @@ def __init__(self): self.input_task_idx = None self.output_task_idx = None - self.node_idx_to_output_refs = {} + self.node_idx_to_output_channels = {} # Cached. 
self.dag_input_ref = None - self.dag_input_max_readers = None - self.dag_output_refs = None + self.dag_output_channels = None self.worker_task_refs = [] - self.actor_refs = set() def add_node(self, node): idx = self.counter @@ -122,7 +92,7 @@ def add_node(self, node): self.counter += 1 def preprocess(self): - from ray.dag import DAGNode, InputNode, OutputNode + from ray.dag import DAGNode, InputNode for idx, task in self.idx_to_task.items(): task.args = task.dag_node.get_args() @@ -140,21 +110,20 @@ def preprocess(self): for idx, task in self.idx_to_task.items(): if len(task.dependent_node_idxs) == 0: - assert ( - self.output_task_idx is None - ), "More than one output node found, make sure only one node has 0 dependent tasks" + assert self.output_task_idx is None, ( + "More than one output node found, " + "make sure only one node has 0 dependent tasks" + ) self.output_task_idx = idx def compiled(self): from ray.dag import DAGNode, InputNode, OutputNode, ClassMethodNode - if self.dag_input_ref is not None and self.dag_output_refs is not None: + if self.dag_input_ref is not None and self.dag_output_channels is not None: # Driver should ray.put on input, ray.get/release on output return ( self.dag_input_ref, - self.dag_input_max_readers, - self.dag_output_refs, - self.monitor, + self.dag_output_channels, ) queue = [self.input_task_idx] @@ -167,16 +136,18 @@ def compiled(self): visited.add(cur_idx) task = self.idx_to_task[cur_idx] - dependent_node_idxs = task.dependent_node_idxs - # Create an output buffer on the actor. - assert task.output_ref is None + assert task.output_channel is None if isinstance(task.dag_node, ClassMethodNode): - fn = task.dag_node._get_remote_method("__ray_apply__") - task.output_ref = ray.get(fn.remote(do_allocate_shared_output_buffer)) - self.actor_refs.add(task.dag_node._get_actor()) + fn = task.dag_node._get_remote_method("__ray_call__") + task.output_channel = ray.get( + fn.remote( + do_allocate_channel, + num_readers=task.num_readers, + ) + ) elif isinstance(task.dag_node, InputNode): - task.output_ref = allocate_shared_output_buffer() + task.output_channel = allocate_channel(num_readers=task.num_readers) else: assert isinstance(task.dag_node, OutputNode) @@ -188,7 +159,6 @@ def compiled(self): # it's not already there. assert isinstance(output_node, OutputNode) - work_refs = [] for node_idx, task in self.idx_to_task.items(): if node_idx == self.input_task_idx: # We don't need to assign an actual task for the input node. @@ -203,91 +173,41 @@ def compiled(self): # TODO(swang): Support non-ObjectRef args. assert isinstance(arg, DAGNode) arg_idx = self.dag_node_to_idx[arg] - arg_buffer = self.idx_to_task[arg_idx].output_ref + arg_buffer = self.idx_to_task[arg_idx].output_channel assert arg_buffer is not None resolved_args.append(arg_buffer) # TODO: Assign the task with the correct input and output buffers. 
- worker_fn = task.dag_node._get_remote_method("__ray_apply__") + worker_fn = task.dag_node._get_remote_method("__ray_call__") self.worker_task_refs.append( - worker_fn.options(concurrency_group="_ray_system").remote( + worker_fn.remote( do_exec_compiled_task, resolved_args, task.dag_node.get_method_name(), - task.max_readers, ) ) - self.dag_input_ref = self.idx_to_task[self.input_task_idx].output_ref - self.dag_input_max_readers = self.idx_to_task[self.input_task_idx].max_readers + self.dag_input_ref = self.idx_to_task[self.input_task_idx].output_channel - self.dag_output_refs = [] + self.dag_output_channels = [] for output in self.idx_to_task[self.output_task_idx].args: assert isinstance(output, DAGNode) output_idx = self.dag_node_to_idx[output] - self.dag_output_refs.append(self.idx_to_task[output_idx].output_ref) + self.dag_output_channels.append(self.idx_to_task[output_idx].output_channel) assert self.dag_input_ref - assert self.dag_output_refs + assert self.dag_output_channels # Driver should ray.put on input, ray.get/release on output - self.monitor = self.monitor_failures() - return (self.dag_input_ref, self.dag_input_max_readers, self.dag_output_refs, self.monitor) - - def monitor_failures(self): - outer = self - - class Monitor(threading.Thread): - def __init__(self): - super().__init__(daemon=True) - self.in_destroy = False - - def destroy(self): - if self.in_destroy: - return - self.in_destroy = True - for actor in outer.actor_refs: - print("Cancelling compiled worker on actor", actor) - try: - ray.get(actor.__ray_apply__.remote(do_cancel_compiled_task)) - except Exception as e: - print("Error cancelling", e) - pass - - def run(self): - try: - ray.get(outer.worker_task_refs) - except Exception as e: - if self.in_destroy: - return - print("Worker task exception", e) - for output_ref in outer.dag_output_refs: - print("Putting error", output_ref) - try: - ray.worker.global_worker.put_object( - e, - object_ref=output_ref, - max_readers=1, - try_wait=True, - ) - except Exception as f: - if "write acquire failed" in str(f): - pass - else: - raise - self.destroy() - - monitor = Monitor() - monitor.start() - return monitor - - -def build_compiled_dag(dag: "DAGNode"): + return (self.dag_input_ref, self.dag_output_channels) + + +def build_compiled_dag(dag: "ray.dag.DAGNode"): compiled_dag = CompiledDAG() - def build_compiled_dag(node): + def _build_compiled_dag(node): compiled_dag.add_node(node) return node - dag.apply_recursive(build_compiled_dag) + dag.apply_recursive(_build_compiled_dag) compiled_dag.preprocess() return compiled_dag diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 5a1f2251438ac..983db7ed35b5d 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -132,11 +132,9 @@ def execute( """ if compiled: assert len(args) == 1, "Compiled DAGs support exactly one InputNode arg" - input_ref, input_max_readers, output_ref, _ = self.compiled() - ray.worker.global_worker.put_object( - args[0], object_ref=input_ref, max_readers=input_max_readers - ) - return output_ref + input_ref, output_channels = self.compiled() + input_ref.write(args[0]) + return output_channels def executor(node): return node._execute_impl(*args, **kwargs) From 533626268b64e029b505b90365c37d7afe47f826 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 5 Dec 2023 10:40:40 -0800 Subject: [PATCH 24/66] fix Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/client.cc | 12 ++++++------ src/ray/object_manager/plasma/object_store.cc | 1 - 2 files changed, 
6 insertions(+), 7 deletions(-) diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index f566392368ab0..53f3eba9e1583 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -876,12 +876,12 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { object_entry->second->is_sealed = true; auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); - if (plasma_header->num_readers != 0) { - plasma_header->WriteRelease( - /*write_version=*/object_entry->second->next_version_to_write); - // The next Write must pass a higher version. - object_entry->second->next_version_to_write++; - } else { + plasma_header->WriteRelease( + /*write_version=*/object_entry->second->next_version_to_write); + // The next Write must pass a higher version. + object_entry->second->next_version_to_write++; + + if (plasma_header->num_readers <= 0) { // Send the seal request to Plasma. This is the normal Seal path, used for // immutable objects and the initial Create call for mutable objects. RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index 8f4178dc9b797..0615e2b5ad023 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -82,7 +82,6 @@ bool ObjectStore::DeleteObject(const ObjectID &object_id) { if (entry == nullptr) { return false; } - // TODO(swang): Make sure Seal coroutine is done before deleting. auto plasma_header = entry->GetPlasmaObjectHeader(); plasma_header->Destroy(); From 99a38c2068a01e96a4e03469dbcd385ba92e94c9 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 5 Dec 2023 10:40:40 -0800 Subject: [PATCH 25/66] fix Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/client.cc | 12 ++++++------ src/ray/object_manager/plasma/object_store.cc | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index f566392368ab0..53f3eba9e1583 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -876,12 +876,12 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { object_entry->second->is_sealed = true; auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); - if (plasma_header->num_readers != 0) { - plasma_header->WriteRelease( - /*write_version=*/object_entry->second->next_version_to_write); - // The next Write must pass a higher version. - object_entry->second->next_version_to_write++; - } else { + plasma_header->WriteRelease( + /*write_version=*/object_entry->second->next_version_to_write); + // The next Write must pass a higher version. + object_entry->second->next_version_to_write++; + + if (plasma_header->num_readers <= 0) { // Send the seal request to Plasma. This is the normal Seal path, used for // immutable objects and the initial Create call for mutable objects. 
RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index 8f4178dc9b797..0615e2b5ad023 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -82,7 +82,6 @@ bool ObjectStore::DeleteObject(const ObjectID &object_id) { if (entry == nullptr) { return false; } - // TODO(swang): Make sure Seal coroutine is done before deleting. auto plasma_header = entry->GetPlasmaObjectHeader(); plasma_header->Destroy(); From e88c40f37ccc5059a17059d5d628c3e2436725a0 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 5 Dec 2023 16:44:10 -0800 Subject: [PATCH 26/66] fix Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/client.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 53f3eba9e1583..99caa590180f3 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -623,7 +623,6 @@ Status PlasmaClient::Impl::GetBuffers( RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaGetReply, &buffer)); std::vector received_object_ids(num_objects); std::vector object_data(num_objects); - auto object = std::make_unique(); std::vector store_fds; std::vector mmap_sizes; RAY_RETURN_NOT_OK(ReadGetReply(buffer.data(), @@ -644,9 +643,10 @@ Status PlasmaClient::Impl::GetBuffers( GetStoreFdAndMmap(store_fds[i], mmap_sizes[i]); } + std::unique_ptr object; for (int64_t i = 0; i < num_objects; ++i) { RAY_DCHECK(received_object_ids[i] == object_ids[i]); - *object = object_data[i]; + object = std::make_unique(object_data[i]); if (object_buffers[i].data) { // If the object was already in use by the client, then the store should // have returned it. From 95e871b441c25c89249650e287034cab9b3c957a Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 5 Dec 2023 17:00:45 -0800 Subject: [PATCH 27/66] compile? Signed-off-by: Stephanie Wang --- src/ray/object_manager/common.cc | 32 +++++++++++++++---------- src/ray/object_manager/common.h | 17 ++++++++----- src/ray/object_manager/plasma/client.cc | 28 +++++++++++++++------- 3 files changed, 50 insertions(+), 27 deletions(-) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index 35a21ce0e4654..e35a2807942a8 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -2,19 +2,8 @@ namespace ray { -void PrintPlasmaObjectHeader(const PlasmaObjectHeader *header) { - RAY_LOG(DEBUG) << "PlasmaObjectHeader: \n" - << "version: " << header->version << "\n" - << "num_readers: " << header->num_readers << "\n" - << "num_read_acquires_remaining: " << header->num_read_acquires_remaining - << "\n" - << "num_read_releases_remaining: " << header->num_read_releases_remaining - << "\n" - << "data_size: " << header->data_size << "\n" - << "metadata_size: " << header->metadata_size << "\n"; -} - void PlasmaObjectHeader::Init() { +#ifndef _WIN32 // wr_mut is shared between writer and readers. 
pthread_mutexattr_t mutex_attr; pthread_mutexattr_init(&mutex_attr); @@ -29,12 +18,29 @@ void PlasmaObjectHeader::Init() { pthread_condattr_init(&cond_attr); pthread_condattr_setpshared(&cond_attr, PTHREAD_PROCESS_SHARED); pthread_cond_init(&cond, &cond_attr); +#endif } void PlasmaObjectHeader::Destroy() { +#ifndef _WIN32 RAY_CHECK(pthread_mutex_destroy(&wr_mut) == 0); RAY_CHECK(pthread_cond_destroy(&cond) == 0); RAY_CHECK(sem_destroy(&rw_semaphore) == 0); +#endif +} + +#ifndef _WIN32 + +void PrintPlasmaObjectHeader(const PlasmaObjectHeader *header) { + RAY_LOG(DEBUG) << "PlasmaObjectHeader: \n" + << "version: " << header->version << "\n" + << "num_readers: " << header->num_readers << "\n" + << "num_read_acquires_remaining: " << header->num_read_acquires_remaining + << "\n" + << "num_read_releases_remaining: " << header->num_read_releases_remaining + << "\n" + << "data_size: " << header->data_size << "\n" + << "metadata_size: " << header->metadata_size << "\n"; } void PlasmaObjectHeader::WriteAcquire(int64_t write_version, @@ -156,4 +162,6 @@ void PlasmaObjectHeader::ReadRelease(int64_t read_version) { } } +#endif + } // namespace ray diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index eea4ccd8eb7ba..4a4d8404a94d5 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -14,7 +14,9 @@ #pragma once +#ifndef _WIN32 #include +#endif #include #include @@ -46,6 +48,8 @@ using RestoreSpilledObjectCallback = /// needed once the object has been Sealed. For experimental mutable objects, /// we use the header to synchronize between writer and readers. struct PlasmaObjectHeader { +// TODO(swang): PlasmaObjectHeader uses pthreads, POSIX mutex and semaphore. +#ifndef _WIN32 // Used to signal to the writer when all readers are done. sem_t rw_semaphore; @@ -89,12 +93,6 @@ struct PlasmaObjectHeader { uint64_t data_size = 0; uint64_t metadata_size = 0; - /// Setup synchronization primitives. - void Init(); - - /// Destroy synchronization primitives. - void Destroy(); - /// Blocks until all readers for the previous write have ReadRelease'd the /// value. Protects against concurrent writers. Caller must pass consecutive /// versions on each new write, starting with write_version=1. @@ -131,6 +129,13 @@ struct PlasmaObjectHeader { /// \param read_version This must match the version previously passed in /// ReadAcquire. void ReadRelease(int64_t read_version); +#endif + + /// Setup synchronization primitives. + void Init(); + + /// Destroy synchronization primitives. + void Destroy(); }; /// A struct that includes info about the object. 
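The synchronization protocol that common.h documents above can be summarized with a small single-process model. The class below is a hypothetical sketch for illustration only: it mirrors the PlasmaObjectHeader fields and the WriteAcquire/WriteRelease/ReadAcquire/ReadRelease operations, but uses a Python threading.Condition in place of the process-shared pthread mutex, condition variable, and semaphore of the real header, and it elides the data_size/metadata_size bookkeeping:

import threading

class ToyObjectHeader:
    """Toy model of the mutable-object header; not the actual implementation."""

    def __init__(self):
        self._cond = threading.Condition()
        self.version = 0
        self.is_sealed = False
        self.num_readers = 0
        self.num_read_acquires_remaining = 0
        self.num_read_releases_remaining = 0

    def write_acquire(self, write_version, num_readers):
        with self._cond:
            # Wait until every reader of the previous version has released it.
            while self.is_sealed and self.num_read_releases_remaining > 0:
                self._cond.wait()
            assert write_version == self.version + 1, "writes must use consecutive versions"
            self.version = write_version
            self.is_sealed = False
            self.num_readers = num_readers

    def write_release(self, write_version):
        with self._cond:
            assert write_version == self.version
            self.is_sealed = True
            self.num_read_acquires_remaining = self.num_readers
            self.num_read_releases_remaining = self.num_readers
            # Wake any readers blocked in read_acquire.
            self._cond.notify_all()

    def read_acquire(self, version_to_read):
        """Returns (success, version_read), mirroring the bool ReadAcquire above."""
        with self._cond:
            # Wait for the requested version (or a newer one) to be sealed.
            while self.version < version_to_read or not self.is_sealed:
                self._cond.wait()
            if self.num_readers == -1:
                # Normal immutable object: unlimited reads, no release needed.
                return True, 0
            if self.version == version_to_read and self.num_read_acquires_remaining > 0:
                self.num_read_acquires_remaining -= 1
                return True, self.version
            # Either the version was already overwritten or num_readers
            # readers already acquired it.
            return False, self.version

    def read_release(self, read_version):
        with self._cond:
            assert read_version == self.version
            self.num_read_releases_remaining -= 1
            assert self.num_read_releases_remaining >= 0
            # The last release lets the writer's next write_acquire proceed.
            self._cond.notify_all()

In the real header the writer blocks on rw_semaphore, which the final ReadRelease posts; the single condition variable above only approximates that handoff, and it does not work across processes the way the shared-memory header does.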
diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 99caa590180f3..f394c014b7a75 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -233,7 +233,13 @@ class PlasmaClient::Impl : public std::enable_shared_from_this(header_ptr); + } +#endif void InsertObjectInUse(const ObjectID &object_id, std::unique_ptr object, @@ -303,13 +309,6 @@ uint8_t *PlasmaClient::Impl::LookupMmappedFile(MEMFD_TYPE store_fd_val) const { return entry->second->pointer(); } -ray::PlasmaObjectHeader *PlasmaClient::Impl::GetPlasmaObjectHeader( - const PlasmaObject &object) const { - auto base_ptr = LookupMmappedFile(object.store_fd); - auto header_ptr = base_ptr + object.header_offset; - return reinterpret_cast(header_ptr); -} - bool PlasmaClient::Impl::IsInUse(const ObjectID &object_id) { std::lock_guard guard(client_mutex_); @@ -411,6 +410,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( int64_t metadata_size, int64_t num_readers, std::shared_ptr *data) { +#ifndef _WIN32 std::unique_lock guard(client_mutex_); auto object_entry = objects_in_use_.find(object_id); if (object_entry == objects_in_use_.end()) { @@ -455,6 +455,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( } entry->is_sealed = false; +#endif return Status::OK(); } @@ -503,6 +504,7 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, RAY_CHECK(!entry->is_sealed); entry->is_mutable = is_mutable; +#ifndef _WIN32 auto plasma_header = GetPlasmaObjectHeader(entry->object); if (entry->is_mutable) { entry->is_writer = true; @@ -518,6 +520,7 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, // Anyone may read an immutable object. /*num_readers=*/-1); } +#endif } return status; @@ -715,6 +718,7 @@ Status PlasmaClient::Impl::Get(const std::vector &object_ids, Status PlasmaClient::Impl::EnsureGetAcquired( std::unique_ptr &object_entry) { +#ifndef _WIN32 PlasmaObject *object = &object_entry->object; auto plasma_header = GetPlasmaObjectHeader(*object); if (object_entry->read_acquired) { @@ -743,11 +747,13 @@ Status PlasmaClient::Impl::EnsureGetAcquired( RAY_CHECK(object_entry->object.data_size + object_entry->object.metadata_size <= object_entry->object.allocated_size); } +#endif return Status::OK(); } Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( const ObjectID &object_id) { +#ifndef _WIN32 RAY_LOG(DEBUG) << "Try to release Get for object " << object_id; std::unique_lock guard(client_mutex_); @@ -773,7 +779,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( // The next read needs to read at least this version. entry->next_version_to_read++; entry->read_acquired = false; - +#endif return Status::OK(); } @@ -875,6 +881,7 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { } object_entry->second->is_sealed = true; +#ifndef _WIN32 auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); plasma_header->WriteRelease( /*write_version=*/object_entry->second->next_version_to_write); @@ -882,6 +889,9 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { object_entry->second->next_version_to_write++; if (plasma_header->num_readers <= 0) { +#else + { +#endif // Send the seal request to Plasma. This is the normal Seal path, used for // immutable objects and the initial Create call for mutable objects. 
RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); From 204bb9bdc3e5514d821acfc129a32357a8963a5c Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 5 Dec 2023 16:44:10 -0800 Subject: [PATCH 28/66] fix Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/client.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 53f3eba9e1583..99caa590180f3 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -623,7 +623,6 @@ Status PlasmaClient::Impl::GetBuffers( RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaGetReply, &buffer)); std::vector received_object_ids(num_objects); std::vector object_data(num_objects); - auto object = std::make_unique(); std::vector store_fds; std::vector mmap_sizes; RAY_RETURN_NOT_OK(ReadGetReply(buffer.data(), @@ -644,9 +643,10 @@ Status PlasmaClient::Impl::GetBuffers( GetStoreFdAndMmap(store_fds[i], mmap_sizes[i]); } + std::unique_ptr object; for (int64_t i = 0; i < num_objects; ++i) { RAY_DCHECK(received_object_ids[i] == object_ids[i]); - *object = object_data[i]; + object = std::make_unique(object_data[i]); if (object_buffers[i].data) { // If the object was already in use by the client, then the store should // have returned it. From 4703f34eed5909b8c088636f444d992ef9a3c1e7 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 5 Dec 2023 17:00:45 -0800 Subject: [PATCH 29/66] compile? Signed-off-by: Stephanie Wang --- src/ray/object_manager/common.cc | 32 +++++++++++++++---------- src/ray/object_manager/common.h | 17 ++++++++----- src/ray/object_manager/plasma/client.cc | 28 +++++++++++++++------- 3 files changed, 50 insertions(+), 27 deletions(-) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index 35a21ce0e4654..e35a2807942a8 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -2,19 +2,8 @@ namespace ray { -void PrintPlasmaObjectHeader(const PlasmaObjectHeader *header) { - RAY_LOG(DEBUG) << "PlasmaObjectHeader: \n" - << "version: " << header->version << "\n" - << "num_readers: " << header->num_readers << "\n" - << "num_read_acquires_remaining: " << header->num_read_acquires_remaining - << "\n" - << "num_read_releases_remaining: " << header->num_read_releases_remaining - << "\n" - << "data_size: " << header->data_size << "\n" - << "metadata_size: " << header->metadata_size << "\n"; -} - void PlasmaObjectHeader::Init() { +#ifndef _WIN32 // wr_mut is shared between writer and readers. 
pthread_mutexattr_t mutex_attr; pthread_mutexattr_init(&mutex_attr); @@ -29,12 +18,29 @@ void PlasmaObjectHeader::Init() { pthread_condattr_init(&cond_attr); pthread_condattr_setpshared(&cond_attr, PTHREAD_PROCESS_SHARED); pthread_cond_init(&cond, &cond_attr); +#endif } void PlasmaObjectHeader::Destroy() { +#ifndef _WIN32 RAY_CHECK(pthread_mutex_destroy(&wr_mut) == 0); RAY_CHECK(pthread_cond_destroy(&cond) == 0); RAY_CHECK(sem_destroy(&rw_semaphore) == 0); +#endif +} + +#ifndef _WIN32 + +void PrintPlasmaObjectHeader(const PlasmaObjectHeader *header) { + RAY_LOG(DEBUG) << "PlasmaObjectHeader: \n" + << "version: " << header->version << "\n" + << "num_readers: " << header->num_readers << "\n" + << "num_read_acquires_remaining: " << header->num_read_acquires_remaining + << "\n" + << "num_read_releases_remaining: " << header->num_read_releases_remaining + << "\n" + << "data_size: " << header->data_size << "\n" + << "metadata_size: " << header->metadata_size << "\n"; } void PlasmaObjectHeader::WriteAcquire(int64_t write_version, @@ -156,4 +162,6 @@ void PlasmaObjectHeader::ReadRelease(int64_t read_version) { } } +#endif + } // namespace ray diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index eea4ccd8eb7ba..4a4d8404a94d5 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -14,7 +14,9 @@ #pragma once +#ifndef _WIN32 #include +#endif #include #include @@ -46,6 +48,8 @@ using RestoreSpilledObjectCallback = /// needed once the object has been Sealed. For experimental mutable objects, /// we use the header to synchronize between writer and readers. struct PlasmaObjectHeader { +// TODO(swang): PlasmaObjectHeader uses pthreads, POSIX mutex and semaphore. +#ifndef _WIN32 // Used to signal to the writer when all readers are done. sem_t rw_semaphore; @@ -89,12 +93,6 @@ struct PlasmaObjectHeader { uint64_t data_size = 0; uint64_t metadata_size = 0; - /// Setup synchronization primitives. - void Init(); - - /// Destroy synchronization primitives. - void Destroy(); - /// Blocks until all readers for the previous write have ReadRelease'd the /// value. Protects against concurrent writers. Caller must pass consecutive /// versions on each new write, starting with write_version=1. @@ -131,6 +129,13 @@ struct PlasmaObjectHeader { /// \param read_version This must match the version previously passed in /// ReadAcquire. void ReadRelease(int64_t read_version); +#endif + + /// Setup synchronization primitives. + void Init(); + + /// Destroy synchronization primitives. + void Destroy(); }; /// A struct that includes info about the object. 
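The #ifndef _WIN32 guards introduced above exist because PlasmaObjectHeader lives in shared memory and relies on POSIX process-shared primitives (a pthread mutex and condition variable marked PTHREAD_PROCESS_SHARED, plus an unnamed semaphore), for which the patch's own TODO notes there is no direct Windows counterpart. As a rough illustration of what Init()/Destroy() set up, the following minimal, self-contained C++ sketch initializes process-shared synchronization inside an anonymous shared mapping. It is not part of any patch in this series; the struct name SharedSync, its field names, and the main() driver are purely illustrative.

// Sketch only: process-shared pthread/semaphore setup in shared memory,
// similar in spirit to PlasmaObjectHeader::Init(). Not taken from the patch.
#include <pthread.h>
#include <semaphore.h>
#include <sys/mman.h>
#include <cstdio>
#include <new>

struct SharedSync {
  sem_t rw_semaphore;     // signals the writer when readers are done
  pthread_mutex_t mut;    // protects the fields below
  pthread_cond_t cond;    // wakes readers waiting for a new version
  long version = 0;       // monotonically increasing write version
};

int main() {
  // Place the struct in memory that forked child processes can also see.
  void *mem = mmap(nullptr, sizeof(SharedSync), PROT_READ | PROT_WRITE,
                   MAP_SHARED | MAP_ANONYMOUS, -1, 0);
  if (mem == MAP_FAILED) {
    perror("mmap");
    return 1;
  }
  auto *sync = new (mem) SharedSync{};

  // Unnamed semaphore shared between processes (pshared=1), initial value 1.
  sem_init(&sync->rw_semaphore, /*pshared=*/1, 1);

  // The mutex and condition variable must be created with the
  // PTHREAD_PROCESS_SHARED attribute; otherwise waking a waiter that lives
  // in another process is undefined behavior.
  pthread_mutexattr_t mattr;
  pthread_mutexattr_init(&mattr);
  pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
  pthread_mutex_init(&sync->mut, &mattr);

  pthread_condattr_t cattr;
  pthread_condattr_init(&cattr);
  pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);
  pthread_cond_init(&sync->cond, &cattr);

  // ... fork() here and let a writer bump sync->version under the mutex,
  // with readers blocking on the condition variable until it advances ...

  pthread_cond_destroy(&sync->cond);
  pthread_mutex_destroy(&sync->mut);
  sem_destroy(&sync->rw_semaphore);
  munmap(mem, sizeof(SharedSync));
  return 0;
}

A writer and its readers coordinate entirely through fields inside the mapping, which is the same pattern the versioned WriteAcquire/WriteRelease and ReadAcquire/ReadRelease methods follow; since none of these POSIX calls are available on Windows, the patch compiles the members and methods out there rather than emulating them.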
diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 99caa590180f3..f394c014b7a75 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -233,7 +233,13 @@ class PlasmaClient::Impl : public std::enable_shared_from_this(header_ptr); + } +#endif void InsertObjectInUse(const ObjectID &object_id, std::unique_ptr object, @@ -303,13 +309,6 @@ uint8_t *PlasmaClient::Impl::LookupMmappedFile(MEMFD_TYPE store_fd_val) const { return entry->second->pointer(); } -ray::PlasmaObjectHeader *PlasmaClient::Impl::GetPlasmaObjectHeader( - const PlasmaObject &object) const { - auto base_ptr = LookupMmappedFile(object.store_fd); - auto header_ptr = base_ptr + object.header_offset; - return reinterpret_cast(header_ptr); -} - bool PlasmaClient::Impl::IsInUse(const ObjectID &object_id) { std::lock_guard guard(client_mutex_); @@ -411,6 +410,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( int64_t metadata_size, int64_t num_readers, std::shared_ptr *data) { +#ifndef _WIN32 std::unique_lock guard(client_mutex_); auto object_entry = objects_in_use_.find(object_id); if (object_entry == objects_in_use_.end()) { @@ -455,6 +455,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( } entry->is_sealed = false; +#endif return Status::OK(); } @@ -503,6 +504,7 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, RAY_CHECK(!entry->is_sealed); entry->is_mutable = is_mutable; +#ifndef _WIN32 auto plasma_header = GetPlasmaObjectHeader(entry->object); if (entry->is_mutable) { entry->is_writer = true; @@ -518,6 +520,7 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, // Anyone may read an immutable object. /*num_readers=*/-1); } +#endif } return status; @@ -715,6 +718,7 @@ Status PlasmaClient::Impl::Get(const std::vector &object_ids, Status PlasmaClient::Impl::EnsureGetAcquired( std::unique_ptr &object_entry) { +#ifndef _WIN32 PlasmaObject *object = &object_entry->object; auto plasma_header = GetPlasmaObjectHeader(*object); if (object_entry->read_acquired) { @@ -743,11 +747,13 @@ Status PlasmaClient::Impl::EnsureGetAcquired( RAY_CHECK(object_entry->object.data_size + object_entry->object.metadata_size <= object_entry->object.allocated_size); } +#endif return Status::OK(); } Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( const ObjectID &object_id) { +#ifndef _WIN32 RAY_LOG(DEBUG) << "Try to release Get for object " << object_id; std::unique_lock guard(client_mutex_); @@ -773,7 +779,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( // The next read needs to read at least this version. entry->next_version_to_read++; entry->read_acquired = false; - +#endif return Status::OK(); } @@ -875,6 +881,7 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { } object_entry->second->is_sealed = true; +#ifndef _WIN32 auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); plasma_header->WriteRelease( /*write_version=*/object_entry->second->next_version_to_write); @@ -882,6 +889,9 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { object_entry->second->next_version_to_write++; if (plasma_header->num_readers <= 0) { +#else + { +#endif // Send the seal request to Plasma. This is the normal Seal path, used for // immutable objects and the initial Create call for mutable objects. 
RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); From 420bd1c060fadb676e228890900890db29a1c538 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 5 Dec 2023 21:20:50 -0800 Subject: [PATCH 30/66] build Signed-off-by: Stephanie Wang --- python/ray/tests/BUILD | 1 + src/ray/object_manager/plasma/object_store.cc | 2 ++ 2 files changed, 3 insertions(+) diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 6560b38950af6..3a48b4edd888a 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -249,6 +249,7 @@ py_test_module_list( "test_annotations.py", "test_args.py", "test_asyncio_cluster.py", + "test_channel.py", "test_concurrency_group.py", "test_component_failures.py", "test_cross_language.py", diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index 0615e2b5ad023..a9324ad39d90f 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -71,9 +71,11 @@ const LocalObject *ObjectStore::SealObject(const ObjectID &object_id) { entry->state = ObjectState::PLASMA_SEALED; entry->construct_duration = std::time(nullptr) - entry->create_time; auto plasma_header = entry->GetPlasmaObjectHeader(); +#ifndef _WIN32 if (!entry->object_info.is_mutable) { RAY_CHECK(plasma_header->num_readers == -1) << plasma_header->num_readers; } +#endif return entry; } From 4cabbc59f684d8bfaab97616b61229b228dc6cdb Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 08:54:16 -0800 Subject: [PATCH 31/66] x Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/client.cc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index f394c014b7a75..13b90b44df050 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -661,14 +661,19 @@ Status PlasmaClient::Impl::GetBuffers( // If we are here, the object was not currently in use, so we need to // process the reply from the object store. if (object->data_size != -1) { - // Increment the count of the number of instances of this object that this - // client is using. Cache the reference to the object. - InsertObjectInUse(received_object_ids[i], std::move(object), /*is_sealed=*/true); + if (objects_in_use_.find(received_object_ids[i]) == objects_in_use_.end()) { + // Increment the count of the number of instances of this object that this + // client is using. Cache the reference to the object. + InsertObjectInUse(received_object_ids[i], std::move(object), /*is_sealed=*/true); + } else { + IncrementObjectCount(received_object_ids[i]); + } auto &object_entry = objects_in_use_[received_object_ids[i]]; // Wait for the object to become ready to read. 
- RAY_CHECK(!object_entry->read_acquired); - RAY_RETURN_NOT_OK(EnsureGetAcquired(object_entry)); + if (!object_entry->read_acquired) { + RAY_RETURN_NOT_OK(EnsureGetAcquired(object_entry)); + } std::shared_ptr physical_buf; RAY_LOG(DEBUG) << "Plasma Get " << received_object_ids[i] << ", data size: " << object_entry->object.data_size From b44ef8ae5d7edb40e1b233abdf145e476c088d3a Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 09:24:49 -0800 Subject: [PATCH 32/66] fix Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/client.cc | 72 ++++++++++++++----------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 13b90b44df050..d06f073c91a01 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -146,6 +146,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data); @@ -204,6 +205,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data); @@ -341,6 +343,7 @@ void PlasmaClient::Impl::IncrementObjectCount(const ObjectID &object_id) { } Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, + bool is_mutable, const uint8_t *metadata, uint64_t *retry_with_request_id, std::shared_ptr *data) { @@ -400,6 +403,32 @@ Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, // buffer returned by PlasmaClient::Create goes out of scope, the object does // not get released before the call to PlasmaClient::Seal happens. IncrementObjectCount(object_id); + + // Create IPC was successful. + auto object_entry = objects_in_use_.find(object_id); + RAY_CHECK(object_entry != objects_in_use_.end()); + auto &entry = object_entry->second; + RAY_CHECK(!entry->is_sealed); + entry->is_mutable = is_mutable; + +#ifndef _WIN32 + auto plasma_header = GetPlasmaObjectHeader(entry->object); + if (entry->is_mutable) { + entry->is_writer = true; + } else { + // The first creation's version is always 1. + RAY_CHECK(entry->next_version_to_write == 1); + // The corresponding WriteRelease takes place in Seal. + // When an object is first created, the data size is equivalent to + // buffer size. + plasma_header->WriteAcquire(entry->next_version_to_write, + entry->object.data_size, + entry->object.metadata_size, + // Anyone may read an immutable object. + /*num_readers=*/-1); + } +#endif + return Status::OK(); } @@ -482,7 +511,8 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, source, device_num, /*try_immediately=*/false)); - Status status = HandleCreateReply(object_id, metadata, &retry_with_request_id, data); + Status status = + HandleCreateReply(object_id, is_mutable, metadata, &retry_with_request_id, data); while (retry_with_request_id > 0) { guard.unlock(); @@ -492,35 +522,12 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, guard.lock(); RAY_LOG(DEBUG) << "Retrying request for object " << object_id << " with request ID " << retry_with_request_id; - status = RetryCreate( - object_id, retry_with_request_id, metadata, &retry_with_request_id, data); - } - - if (status.ok()) { - // Create IPC was successful. 
- auto object_entry = objects_in_use_.find(object_id); - RAY_CHECK(object_entry != objects_in_use_.end()); - auto &entry = object_entry->second; - RAY_CHECK(!entry->is_sealed); - entry->is_mutable = is_mutable; - -#ifndef _WIN32 - auto plasma_header = GetPlasmaObjectHeader(entry->object); - if (entry->is_mutable) { - entry->is_writer = true; - } else { - // The first creation's version is always 1. - RAY_CHECK(entry->next_version_to_write == 1); - // The corresponding WriteRelease takes place in Seal. - // When an object is first created, the data size is equivalent to - // buffer size. - plasma_header->WriteAcquire(entry->next_version_to_write, - data_size, - metadata_size, - // Anyone may read an immutable object. - /*num_readers=*/-1); - } -#endif + status = RetryCreate(object_id, + retry_with_request_id, + is_mutable, + metadata, + &retry_with_request_id, + data); } return status; @@ -528,12 +535,13 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, Status PlasmaClient::Impl::RetryCreate(const ObjectID &object_id, uint64_t request_id, + bool is_mutable, const uint8_t *metadata, uint64_t *retry_with_request_id, std::shared_ptr *data) { std::lock_guard guard(client_mutex_); RAY_RETURN_NOT_OK(SendCreateRetryRequest(store_conn_, object_id, request_id)); - return HandleCreateReply(object_id, metadata, retry_with_request_id, data); + return HandleCreateReply(object_id, is_mutable, metadata, retry_with_request_id, data); } Status PlasmaClient::Impl::TryCreateImmediately(const ObjectID &object_id, @@ -557,7 +565,7 @@ Status PlasmaClient::Impl::TryCreateImmediately(const ObjectID &object_id, source, device_num, /*try_immediately=*/true)); - return HandleCreateReply(object_id, metadata, nullptr, data); + return HandleCreateReply(object_id, /*is_mutable=*/false, metadata, nullptr, data); } Status PlasmaClient::Impl::GetBuffers( From e54972b99872450c902a3d97d9889a2f290f1457 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 09:40:20 -0800 Subject: [PATCH 33/66] unit test Signed-off-by: Stephanie Wang --- python/ray/dag/BUILD | 8 ++++ python/ray/dag/tests/test_accelerated_dag.py | 50 ++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 python/ray/dag/tests/test_accelerated_dag.py diff --git a/python/ray/dag/BUILD b/python/ray/dag/BUILD index 53e61563231ff..f5eea4e6155b5 100644 --- a/python/ray/dag/BUILD +++ b/python/ray/dag/BUILD @@ -60,3 +60,11 @@ py_test( tags = ["exclusive", "team:core", "ray_dag_tests"], deps = [":dag_lib"], ) + +py_test( + name = "test_accelerated_dag", + size = "small", + srcs = dag_tests_srcs, + tags = ["exclusive", "team:core", "ray_dag_tests"], + deps = [":dag_lib"], +) diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py new file mode 100644 index 0000000000000..b07491c1f18c6 --- /dev/null +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -0,0 +1,50 @@ +# coding: utf-8 +import logging +import os +import sys + +import numpy as np +import pytest + +import ray +import ray.cluster_utils +import ray.experimental.channel as ray_channel +from ray.dag import DAGNode, InputNode, OutputNode + + +logger = logging.getLogger(__name__) + + +@ray.remote(concurrency_groups={"_ray_system": 1}) +class Actor: + def __init__(self, init_value): + print("__init__ PID", os.getpid()) + self.i = init_value + + def inc(self, x): + self.i += x + return self.i + + +@pytest.mark.parametrize("num_actors", [1, 4]) +def test_scatter_gather_dag(ray_start_regular, num_actors): + 
init_val = 0 + actors = [Actor.remote(init_val) for _ in range(num_actors)] + with InputNode() as i: + out = [a.inc.bind(i) for a in actors] + dag = OutputNode(out) + + for i in range(3): + output_channels = dag.execute(1, compiled=True) + # TODO(swang): Replace with fake ObjectRef. + results = [chan.begin_read() for chan in output_channels] + assert results == [init_val + i + 1] * num_actors + for chan in output_channels: + chan.end_read() + + +if __name__ == "__main__": + if os.environ.get("PARALLEL_CI"): + sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) + else: + sys.exit(pytest.main(["-sv", __file__])) From 881d5ff6aa807c40a381b4833c41d060914cbb86 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 14:08:18 -0800 Subject: [PATCH 34/66] copyright Signed-off-by: Stephanie Wang --- src/ray/object_manager/common.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index e35a2807942a8..cb3335f9cf8c9 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -1,3 +1,17 @@ +// Copyright 2020-2021 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "ray/object_manager/common.h" namespace ray { From ef2cfb7bf5376aed1801a30a75c97629cbda82ac Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 14:18:54 -0800 Subject: [PATCH 35/66] test Signed-off-by: Stephanie Wang --- python/ray/tests/test_object_store_metrics.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/ray/tests/test_object_store_metrics.py b/python/ray/tests/test_object_store_metrics.py index 60922a888dc47..7e72919e761b4 100644 --- a/python/ray/tests/test_object_store_metrics.py +++ b/python/ray/tests/test_object_store_metrics.py @@ -92,7 +92,7 @@ def test_shared_memory_and_inline_worker_heap(shutdown_only): wait_for_condition( # 1KiB for metadata difference - lambda: approx_eq_dict_in(objects_by_loc(info), expected, 1 * KiB), + lambda: approx_eq_dict_in(objects_by_loc(info), expected, 2 * KiB), timeout=20, retry_interval_ms=500, ) @@ -134,7 +134,7 @@ def func(): wait_for_condition( # 1KiB for metadata difference - lambda: approx_eq_dict_in(objects_by_loc(info), expected, 1 * KiB), + lambda: approx_eq_dict_in(objects_by_loc(info), expected, 2 * KiB), timeout=20, retry_interval_ms=500, ) @@ -255,7 +255,7 @@ def test_fallback_memory(shutdown_only): wait_for_condition( # 2KiB for metadata difference - lambda: approx_eq_dict_in(objects_by_loc(info), expected, 2 * KiB), + lambda: approx_eq_dict_in(objects_by_loc(info), expected, 3 * KiB), timeout=20, retry_interval_ms=500, ) @@ -282,8 +282,8 @@ def test_fallback_memory(shutdown_only): } wait_for_condition( - # 1KiB for metadata difference - lambda: approx_eq_dict_in(objects_by_loc(info), expected, 2 * KiB), + # 3KiB for metadata difference + lambda: approx_eq_dict_in(objects_by_loc(info), expected, 3 * KiB), timeout=20, retry_interval_ms=500, ) @@ -302,8 
+302,8 @@ def test_fallback_memory(shutdown_only): } wait_for_condition( - # 1KiB for metadata difference - lambda: approx_eq_dict_in(objects_by_loc(info), expected, 2 * KiB), + # 3KiB for metadata difference + lambda: approx_eq_dict_in(objects_by_loc(info), expected, 3 * KiB), timeout=20, retry_interval_ms=500, ) @@ -333,7 +333,7 @@ def test_seal_memory(shutdown_only): wait_for_condition( # 1KiB for metadata difference - lambda: approx_eq_dict_in(objects_by_seal_state(info), expected, 1 * KiB), + lambda: approx_eq_dict_in(objects_by_seal_state(info), expected, 2 * KiB), timeout=20, retry_interval_ms=500, ) @@ -347,7 +347,7 @@ def test_seal_memory(shutdown_only): wait_for_condition( # 1KiB for metadata difference - lambda: approx_eq_dict_in(objects_by_seal_state(info), expected, 1 * KiB), + lambda: approx_eq_dict_in(objects_by_seal_state(info), expected, 2 * KiB), timeout=20, retry_interval_ms=500, ) From 93968107d6de00336ca5af23a3c95e9f655ddc89 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 14:55:32 -0800 Subject: [PATCH 36/66] tmp Signed-off-by: Stephanie Wang --- python/ray/_raylet.pyx | 7 +++++++ python/ray/experimental/channel.py | 4 ++++ src/ray/core_worker/core_worker.cc | 4 ++++ 3 files changed, 15 insertions(+) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 6495d46a57f7c..5044f09ab388e 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -3508,6 +3508,13 @@ cdef class CoreWorker: generator_id=CObjectID.Nil(), owner_address=null_owner_address)) + def experimental_mutable_object_get_current_output_ref(self, ObjectRef channel_ref): + cdef: + CObjectID c_channel_ref = channel_ref + with nogil: + return (CCoreWorkerProcess.GetCoreWorker() + .ExperimentalMutableObjectGetCurrentOutputRef(c_channel_ref)) + def experimental_mutable_object_read_release(self, object_refs): """ For experimental.channel.Channel. diff --git a/python/ray/experimental/channel.py b/python/ray/experimental/channel.py index e8ef9ad085f79..11f5e36053822 100644 --- a/python/ray/experimental/channel.py +++ b/python/ray/experimental/channel.py @@ -120,6 +120,10 @@ def write(self, value: Any, num_readers: Optional[int] = None): num_readers, ) + def get_current_output_ref(self) -> "ray.ObjectRef": + return self._worker.core_worker.experimental_mutable_object_get_current_output_ref( + self._base_ref) + def begin_read(self) -> Any: """ Read the latest value from the channel. 
This call will block until a diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 57000d2b3abbc..cd0d5a38e5607 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1307,6 +1307,10 @@ Status CoreWorker::CreateOwnedAndIncrementLocalRef( memory_store_->Put(RayObject(rpc::ErrorType::OBJECT_IN_PLASMA), *object_id)); } } + + if (is_experimental_mutable_object) { + RegisterChannel(*object_id); + } return Status::OK(); } From dbbb3d6b89927eb8f1fbcc78c0b60317f0b58d89 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 16:39:31 -0800 Subject: [PATCH 37/66] Only allocate PlasmaObjectHeader if is_mutable=true Signed-off-by: Stephanie Wang --- src/ray/object_manager/common.h | 4 ++-- src/ray/object_manager/plasma/common.h | 10 +++++++--- src/ray/object_manager/plasma/object_store.cc | 20 +++++++++---------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 4a4d8404a94d5..e3e8381de9330 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -141,7 +141,7 @@ struct PlasmaObjectHeader { /// A struct that includes info about the object. struct ObjectInfo { ObjectID object_id; - bool is_mutable; + bool is_mutable = false; int64_t data_size = 0; int64_t metadata_size = 0; /// Owner's raylet ID. @@ -154,7 +154,7 @@ struct ObjectInfo { WorkerID owner_worker_id; int64_t GetObjectSize() const { - return sizeof(PlasmaObjectHeader) + data_size + metadata_size; + return data_size + metadata_size + (is_mutable ? sizeof(PlasmaObjectHeader) : 0); } bool operator==(const ObjectInfo &other) const { diff --git a/src/ray/object_manager/plasma/common.h b/src/ray/object_manager/plasma/common.h index d24a110c32e44..414af54b4a544 100644 --- a/src/ray/object_manager/plasma/common.h +++ b/src/ray/object_manager/plasma/common.h @@ -124,6 +124,7 @@ class LocalObject { const plasma::flatbuf::ObjectSource &GetSource() const { return source; } ray::PlasmaObjectHeader *GetPlasmaObjectHeader() const { + RAY_CHECK(object_info.is_mutable) << "Object is not mutable"; auto header_ptr = static_cast(allocation.address); return reinterpret_cast(header_ptr); } @@ -135,9 +136,12 @@ class LocalObject { } object->store_fd = GetAllocation().fd; object->header_offset = GetAllocation().offset; - object->data_offset = GetAllocation().offset + sizeof(ray::PlasmaObjectHeader); - object->metadata_offset = GetAllocation().offset + sizeof(ray::PlasmaObjectHeader) + - GetObjectInfo().data_size; + object->data_offset = GetAllocation().offset; + object->metadata_offset = GetAllocation().offset + GetObjectInfo().data_size; + if (object_info.is_mutable) { + object->data_offset += sizeof(ray::PlasmaObjectHeader); + object->metadata_offset += sizeof(ray::PlasmaObjectHeader); + }; object->data_size = GetObjectInfo().data_size; object->metadata_size = GetObjectInfo().metadata_size; // Senders and receivers of a channel may store different data and metadata diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index a9324ad39d90f..4262a282f3fa9 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -47,9 +47,11 @@ const LocalObject *ObjectStore::CreateObject(const ray::ObjectInfo &object_info, entry->construct_duration = -1; entry->source = source; - auto plasma_header = entry->GetPlasmaObjectHeader(); - *plasma_header = ray::PlasmaObjectHeader{}; 
- plasma_header->Init(); + if (object_info.is_mutable) { + auto plasma_header = entry->GetPlasmaObjectHeader(); + *plasma_header = ray::PlasmaObjectHeader{}; + plasma_header->Init(); + } RAY_LOG(DEBUG) << "create object " << object_info.object_id << " succeeded"; return entry; @@ -70,12 +72,6 @@ const LocalObject *ObjectStore::SealObject(const ObjectID &object_id) { } entry->state = ObjectState::PLASMA_SEALED; entry->construct_duration = std::time(nullptr) - entry->create_time; - auto plasma_header = entry->GetPlasmaObjectHeader(); -#ifndef _WIN32 - if (!entry->object_info.is_mutable) { - RAY_CHECK(plasma_header->num_readers == -1) << plasma_header->num_readers; - } -#endif return entry; } @@ -84,8 +80,10 @@ bool ObjectStore::DeleteObject(const ObjectID &object_id) { if (entry == nullptr) { return false; } - auto plasma_header = entry->GetPlasmaObjectHeader(); - plasma_header->Destroy(); + if (entry->object_info.is_mutable) { + auto plasma_header = entry->GetPlasmaObjectHeader(); + plasma_header->Destroy(); + } allocator_.Free(std::move(entry->allocation)); object_table_.erase(object_id); From 9078776a704335e4646e48c565f6be4a0eb8382c Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 18:34:35 -0800 Subject: [PATCH 38/66] Only call Read/Write Acquire/Release if is_mutable=true Signed-off-by: Stephanie Wang --- python/ray/_raylet.pyx | 9 +- python/ray/includes/libcoreworker.pxd | 2 + src/ray/core_worker/core_worker.cc | 4 + src/ray/core_worker/core_worker.h | 7 + .../store_provider/plasma_store_provider.cc | 5 + .../store_provider/plasma_store_provider.h | 7 + src/ray/object_manager/plasma/client.cc | 174 +++++++++--------- src/ray/object_manager/plasma/client.h | 9 + src/ray/object_manager/plasma/common.h | 1 + src/ray/object_manager/plasma/plasma.fbs | 3 + src/ray/object_manager/plasma/plasma.h | 1 + src/ray/object_manager/plasma/protocol.cc | 10 +- 12 files changed, 137 insertions(+), 95 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 6495d46a57f7c..595986851ed7c 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -3502,11 +3502,10 @@ cdef class CoreWorker: if data_size > 0: (serialized_object).write_to( Buffer.make(data)) - check_status( - CCoreWorkerProcess.GetCoreWorker().SealExisting( - c_object_id, pin_object=False, - generator_id=CObjectID.Nil(), - owner_address=null_owner_address)) + check_status(CCoreWorkerProcess.GetCoreWorker() + .ExperimentalMutableObjectWriteRelease( + c_object_id, + )) def experimental_mutable_object_read_release(self, object_refs): """ diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 00bf9b5f9d4e6..ab2fb6c1f8f5a 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -246,6 +246,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: uint64_t data_size, int64_t num_readers, shared_ptr[CBuffer] *data) + CRayStatus ExperimentalMutableObjectWriteRelease( + const CObjectID &object_id) CRayStatus SealOwned(const CObjectID &object_id, c_bool pin_object, const unique_ptr[CAddress] &owner_address) CRayStatus SealExisting(const CObjectID &object_id, c_bool pin_object, diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 57000d2b3abbc..ce1ebe57de8e5 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1336,6 +1336,10 @@ Status CoreWorker::ExperimentalMutableObjectWriteAcquire( object_id, metadata, data_size, num_readers, data); } 
+Status CoreWorker::ExperimentalMutableObjectWriteRelease(const ObjectID &object_id) { + return plasma_store_provider_->ExperimentalMutableObjectWriteRelease(object_id); +} + Status CoreWorker::SealOwned(const ObjectID &object_id, bool pin_object, const std::unique_ptr &owner_address) { diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index ea01d202fba75..606868b380f29 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -698,6 +698,13 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { int64_t num_readers, std::shared_ptr *data); + /// Experimental method for mutable objects. Releases a write lock on the + /// object, allowing readers to read. This is the equivalent of "Seal" for + /// normal objects. + /// + /// \param[in] object_id The ID of the object. + Status ExperimentalMutableObjectWriteRelease(const ObjectID &object_id); + /// Experimental method for mutable objects. Releases the objects, allowing them /// to be written again. If the caller did not previously Get the objects, /// then this first blocks until the latest value is available to read, then diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.cc b/src/ray/core_worker/store_provider/plasma_store_provider.cc index 9d22c256355fc..d4c97b9e0ef01 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.cc +++ b/src/ray/core_worker/store_provider/plasma_store_provider.cc @@ -123,6 +123,11 @@ Status CoreWorkerPlasmaStoreProvider::ExperimentalMutableObjectWriteAcquire( data); } +Status CoreWorkerPlasmaStoreProvider::ExperimentalMutableObjectWriteRelease( + const ObjectID &object_id) { + return store_client_.ExperimentalMutableObjectWriteRelease(object_id); +} + Status CoreWorkerPlasmaStoreProvider::Create(const std::shared_ptr &metadata, const size_t data_size, const ObjectID &object_id, diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.h b/src/ray/core_worker/store_provider/plasma_store_provider.h index fff93c48c2e4a..3656655623828 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.h +++ b/src/ray/core_worker/store_provider/plasma_store_provider.h @@ -200,6 +200,13 @@ class CoreWorkerPlasmaStoreProvider { int64_t num_readers, std::shared_ptr *data); + /// Experimental method for mutable objects. Releases a write lock on the + /// object, allowing readers to read. This is the equivalent of "Seal" for + /// normal objects. + /// + /// \param[in] object_id The ID of the object. + Status ExperimentalMutableObjectWriteRelease(const ObjectID &object_id); + /// Experimental method for mutable objects. Releases the objects, allowing them /// to be written again. If the caller did not previously Get the objects, /// then this first blocks until the latest value is available to read, then diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index d06f073c91a01..36ea5ba8bd3d1 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -98,11 +98,6 @@ struct ObjectInUseEntry { /// The below fields are experimental and used to implement /// ray.experimental.channel. - /// - /// Whether the object is mutable. Most objects are immutable and cannot be - /// written to after the initial Create and Seal call. Mutable objects are - /// used to implement ray.experimental.channel. - bool is_mutable = false; /// Whether we are the writer. For now, only the original creator of the /// mutable object may write to it. 
bool is_writer = false; @@ -136,7 +131,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data); @@ -167,6 +162,8 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data); + Status ExperimentalMutableObjectWriteRelease(const ObjectID &object_id); + Status Get(const std::vector &object_ids, int64_t timeout_ms, std::vector *object_buffers, @@ -205,7 +202,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data); @@ -235,13 +232,11 @@ class PlasmaClient::Impl : public std::enable_shared_from_this(header_ptr); } -#endif void InsertObjectInUse(const ObjectID &object_id, std::unique_ptr object, @@ -343,7 +338,7 @@ void PlasmaClient::Impl::IncrementObjectCount(const ObjectID &object_id) { } Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, - bool is_mutable, + bool is_experimental_mutable_object, const uint8_t *metadata, uint64_t *retry_with_request_id, std::shared_ptr *data) { @@ -409,25 +404,7 @@ Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, RAY_CHECK(object_entry != objects_in_use_.end()); auto &entry = object_entry->second; RAY_CHECK(!entry->is_sealed); - entry->is_mutable = is_mutable; - -#ifndef _WIN32 - auto plasma_header = GetPlasmaObjectHeader(entry->object); - if (entry->is_mutable) { - entry->is_writer = true; - } else { - // The first creation's version is always 1. - RAY_CHECK(entry->next_version_to_write == 1); - // The corresponding WriteRelease takes place in Seal. - // When an object is first created, the data size is equivalent to - // buffer size. - plasma_header->WriteAcquire(entry->next_version_to_write, - entry->object.data_size, - entry->object.metadata_size, - // Anyone may read an immutable object. - /*num_readers=*/-1); - } -#endif + entry->is_writer = true; return Status::OK(); } @@ -453,7 +430,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( RAY_CHECK(object_entry != objects_in_use_.end()); auto &entry = object_entry->second; - RAY_CHECK(entry->is_mutable); + RAY_CHECK(entry->object.is_experimental_mutable_object); RAY_CHECK(entry->is_sealed) << "Must Seal before writing again to a mutable object"; RAY_LOG(DEBUG) << "Write mutable object " << object_id; @@ -488,9 +465,39 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( return Status::OK(); } +Status PlasmaClient::Impl::ExperimentalMutableObjectWriteRelease( + const ObjectID &object_id) { +#ifndef _WIN32 + std::unique_lock guard(client_mutex_); + auto object_entry = objects_in_use_.find(object_id); + if (object_entry == objects_in_use_.end()) { + return Status::Invalid( + "Plasma buffer for mutable object not in scope. Are you sure you're the writer?"); + } + if (!object_entry->second->is_writer) { + return Status::Invalid( + "Mutable objects can only be written by the original creator process."); + } + RAY_CHECK(object_entry != objects_in_use_.end()); + + auto &entry = object_entry->second; + RAY_CHECK(entry->object.is_experimental_mutable_object); + RAY_CHECK(!entry->is_sealed) + << "Must WriteAcquire before WriteRelease on a mutable object"; + + entry->is_sealed = true; + auto plasma_header = GetPlasmaObjectHeader(entry->object); + plasma_header->WriteRelease( + /*write_version=*/entry->next_version_to_write); + // The next Write must pass a higher version. 
+ entry->next_version_to_write++; +#endif + return Status::OK(); +} + Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, - bool is_mutable, + bool is_experimental_mutable_object, int64_t data_size, const uint8_t *metadata, int64_t metadata_size, @@ -505,14 +512,14 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, RAY_RETURN_NOT_OK(SendCreateRequest(store_conn_, object_id, owner_address, - is_mutable, + is_experimental_mutable_object, data_size, metadata_size, source, device_num, /*try_immediately=*/false)); - Status status = - HandleCreateReply(object_id, is_mutable, metadata, &retry_with_request_id, data); + Status status = HandleCreateReply( + object_id, is_experimental_mutable_object, metadata, &retry_with_request_id, data); while (retry_with_request_id > 0) { guard.unlock(); @@ -524,7 +531,7 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, << retry_with_request_id; status = RetryCreate(object_id, retry_with_request_id, - is_mutable, + is_experimental_mutable_object, metadata, &retry_with_request_id, data); @@ -535,13 +542,14 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, Status PlasmaClient::Impl::RetryCreate(const ObjectID &object_id, uint64_t request_id, - bool is_mutable, + bool is_experimental_mutable_object, const uint8_t *metadata, uint64_t *retry_with_request_id, std::shared_ptr *data) { std::lock_guard guard(client_mutex_); RAY_RETURN_NOT_OK(SendCreateRetryRequest(store_conn_, object_id, request_id)); - return HandleCreateReply(object_id, is_mutable, metadata, retry_with_request_id, data); + return HandleCreateReply( + object_id, is_experimental_mutable_object, metadata, retry_with_request_id, data); } Status PlasmaClient::Impl::TryCreateImmediately(const ObjectID &object_id, @@ -559,13 +567,14 @@ Status PlasmaClient::Impl::TryCreateImmediately(const ObjectID &object_id, RAY_RETURN_NOT_OK(SendCreateRequest(store_conn_, object_id, owner_address, - /*is_mutable=*/false, + /*is_experimental_mutable_object=*/false, data_size, metadata_size, source, device_num, /*try_immediately=*/true)); - return HandleCreateReply(object_id, /*is_mutable=*/false, metadata, nullptr, data); + return HandleCreateReply( + object_id, /*is_experimental_mutable_object=*/false, metadata, nullptr, data); } Status PlasmaClient::Impl::GetBuffers( @@ -594,8 +603,10 @@ Status PlasmaClient::Impl::GetBuffers( << "Attempting to get an object that this client created but hasn't sealed."; all_present = false; } else { - // Wait for the object to become ready to read. - RAY_RETURN_NOT_OK(EnsureGetAcquired(object_entry->second)); + if (object_entry->second->object.is_experimental_mutable_object) { + // Wait for the object to become ready to read. + RAY_RETURN_NOT_OK(EnsureGetAcquired(object_entry->second)); + } PlasmaObject *object = &object_entry->second->object; @@ -679,7 +690,7 @@ Status PlasmaClient::Impl::GetBuffers( auto &object_entry = objects_in_use_[received_object_ids[i]]; // Wait for the object to become ready to read. 
- if (!object_entry->read_acquired) { + if (object_entry->object.is_experimental_mutable_object) { RAY_RETURN_NOT_OK(EnsureGetAcquired(object_entry)); } std::shared_ptr physical_buf; @@ -733,6 +744,7 @@ Status PlasmaClient::Impl::EnsureGetAcquired( std::unique_ptr &object_entry) { #ifndef _WIN32 PlasmaObject *object = &object_entry->object; + RAY_CHECK(object->is_experimental_mutable_object); auto plasma_header = GetPlasmaObjectHeader(*object); if (object_entry->read_acquired) { return Status::OK(); @@ -747,19 +759,17 @@ Status PlasmaClient::Impl::EnsureGetAcquired( } object_entry->read_acquired = true; - if (version_read > 0) { - object_entry->is_mutable = true; - object_entry->next_version_to_read = version_read; - - // The data and metadata size may have changed, so update here before we - // create the Get buffer to return. - object_entry->object.data_size = plasma_header->data_size; - object_entry->object.metadata_size = plasma_header->metadata_size; - object_entry->object.metadata_offset = - object_entry->object.data_offset + object_entry->object.data_size; - RAY_CHECK(object_entry->object.data_size + object_entry->object.metadata_size <= - object_entry->object.allocated_size); - } + RAY_CHECK(version_read > 0); + object_entry->next_version_to_read = version_read; + + // The data and metadata size may have changed, so update here before we + // create the Get buffer to return. + object_entry->object.data_size = plasma_header->data_size; + object_entry->object.metadata_size = plasma_header->metadata_size; + object_entry->object.metadata_offset = + object_entry->object.data_offset + object_entry->object.data_size; + RAY_CHECK(object_entry->object.data_size + object_entry->object.metadata_size <= + object_entry->object.allocated_size); #endif return Status::OK(); } @@ -780,7 +790,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( if (!entry->is_sealed) { return Status::ObjectNotFound("ray.release() called on an object that is not sealed"); } - if (!entry->is_mutable) { + if (!entry->object.is_experimental_mutable_object) { return Status::ObjectNotFound( "ray.release() called on an object that is not mutable"); } @@ -816,7 +826,7 @@ Status PlasmaClient::Impl::Release(const ObjectID &object_id) { const auto object_entry = objects_in_use_.find(object_id); RAY_CHECK(object_entry != objects_in_use_.end()); - if (!object_entry->second->is_mutable) { + if (!object_entry->second->object.is_experimental_mutable_object) { // Release only applies to immutable objects. // TODO(swang): Add a delete call to properly clean up mutable objects. object_entry->second->count -= 1; @@ -894,32 +904,20 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { } object_entry->second->is_sealed = true; -#ifndef _WIN32 - auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); - plasma_header->WriteRelease( - /*write_version=*/object_entry->second->next_version_to_write); - // The next Write must pass a higher version. - object_entry->second->next_version_to_write++; - - if (plasma_header->num_readers <= 0) { -#else - { -#endif - // Send the seal request to Plasma. This is the normal Seal path, used for - // immutable objects and the initial Create call for mutable objects. 
- RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); - std::vector buffer; - RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); - ObjectID sealed_id; - RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); - RAY_CHECK(sealed_id == object_id); - // We call PlasmaClient::Release to decrement the number of instances of this - // object - // that are currently being used by this client. The corresponding increment - // happened in plasma_create and was used to ensure that the object was not - // released before the call to PlasmaClient::Seal. - RAY_RETURN_NOT_OK(Release(object_id)); - } + // Send the seal request to Plasma. This is the normal Seal path, used for + // immutable objects and the initial Create call for mutable objects. + RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); + std::vector buffer; + RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); + ObjectID sealed_id; + RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); + RAY_CHECK(sealed_id == object_id); + // We call PlasmaClient::Release to decrement the number of instances of this + // object + // that are currently being used by this client. The corresponding increment + // happened in plasma_create and was used to ensure that the object was not + // released before the call to PlasmaClient::Seal. + RAY_RETURN_NOT_OK(Release(object_id)); return Status::OK(); } @@ -1061,9 +1059,13 @@ Status PlasmaClient::ExperimentalMutableObjectWriteAcquire( object_id, data_size, metadata, metadata_size, num_readers, data); } +Status PlasmaClient::ExperimentalMutableObjectWriteRelease(const ObjectID &object_id) { + return impl_->ExperimentalMutableObjectWriteRelease(object_id); +} + Status PlasmaClient::CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, - bool is_mutable, + bool is_experimental_mutable_object, int64_t data_size, const uint8_t *metadata, int64_t metadata_size, @@ -1072,7 +1074,7 @@ Status PlasmaClient::CreateAndSpillIfNeeded(const ObjectID &object_id, int device_num) { return impl_->CreateAndSpillIfNeeded(object_id, owner_address, - is_mutable, + is_experimental_mutable_object, data_size, metadata, metadata_size, diff --git a/src/ray/object_manager/plasma/client.h b/src/ray/object_manager/plasma/client.h index e3f1aa1b05e3f..bf9b9099f39b5 100644 --- a/src/ray/object_manager/plasma/client.h +++ b/src/ray/object_manager/plasma/client.h @@ -103,6 +103,13 @@ class PlasmaClientInterface { int64_t num_readers, std::shared_ptr *data) = 0; + /// Experimental method for mutable objects. Releases a write lock on the + /// object, allowing readers to read. This is the equivalent of "Seal" for + /// normal objects. + /// + /// \param[in] object_id The ID of the object. + virtual Status ExperimentalMutableObjectWriteRelease(const ObjectID &object_id) = 0; + /// Experimental method for mutable objects. Releases the objects, allowing them /// to be written again. If the caller did not previously Get the objects, /// then this first blocks until the latest value is available to read, then @@ -238,6 +245,8 @@ class PlasmaClient : public PlasmaClientInterface { int64_t num_readers, std::shared_ptr *data); + Status ExperimentalMutableObjectWriteRelease(const ObjectID &object_id); + /// Create an object in the Plasma Store. Any metadata for this object must be /// be passed in when the object is created. 
/// diff --git a/src/ray/object_manager/plasma/common.h b/src/ray/object_manager/plasma/common.h index 414af54b4a544..6ffb86fdeb761 100644 --- a/src/ray/object_manager/plasma/common.h +++ b/src/ray/object_manager/plasma/common.h @@ -150,6 +150,7 @@ class LocalObject { object->allocated_size = object->data_size + object->metadata_size; object->device_num = GetAllocation().device_num; object->mmap_size = GetAllocation().mmap_size; + object->is_experimental_mutable_object = object_info.is_mutable; } private: diff --git a/src/ray/object_manager/plasma/plasma.fbs b/src/ray/object_manager/plasma/plasma.fbs index 0c4f8ac66f48b..317e0aad4846a 100644 --- a/src/ray/object_manager/plasma/plasma.fbs +++ b/src/ray/object_manager/plasma/plasma.fbs @@ -112,6 +112,9 @@ struct PlasmaObjectSpec { allocated_size: ulong; // Device to create buffer on. device_num: int; + // Whether this is an experimental mutable object that can be written + // multiple times by a client. + is_experimental_mutable_object: bool; } table PlasmaGetDebugStringRequest { diff --git a/src/ray/object_manager/plasma/plasma.h b/src/ray/object_manager/plasma/plasma.h index bb21f394d5b0c..7b1367181ad34 100644 --- a/src/ray/object_manager/plasma/plasma.h +++ b/src/ray/object_manager/plasma/plasma.h @@ -55,6 +55,7 @@ struct PlasmaObject { int device_num; /// Set if device_num is equal to 0. int64_t mmap_size; + bool is_experimental_mutable_object = false; bool operator==(const PlasmaObject &other) const { return ((store_fd == other.store_fd) && (data_offset == other.data_offset) && diff --git a/src/ray/object_manager/plasma/protocol.cc b/src/ray/object_manager/plasma/protocol.cc index c041486bdefec..fd880daa7c064 100644 --- a/src/ray/object_manager/plasma/protocol.cc +++ b/src/ray/object_manager/plasma/protocol.cc @@ -200,7 +200,7 @@ Status SendCreateRetryRequest(const std::shared_ptr &store_conn, Status SendCreateRequest(const std::shared_ptr &store_conn, ObjectID object_id, const ray::rpc::Address &owner_address, - bool is_mutable, + bool is_experimental_mutable_object, int64_t data_size, int64_t metadata_size, flatbuf::ObjectSource source, @@ -214,7 +214,7 @@ Status SendCreateRequest(const std::shared_ptr &store_conn, fbb.CreateString(owner_address.ip_address()), owner_address.port(), fbb.CreateString(owner_address.worker_id()), - is_mutable, + is_experimental_mutable_object, data_size, metadata_size, source, @@ -269,7 +269,8 @@ Status SendCreateReply(const std::shared_ptr &client, object.metadata_offset, object.metadata_size, object.allocated_size, - object.device_num); + object.device_num, + object.is_experimental_mutable_object); auto object_string = fbb.CreateString(object_id.Binary()); fb::PlasmaCreateReplyBuilder crb(fbb); crb.add_error(static_cast(error_code)); @@ -627,7 +628,8 @@ Status SendGetReply(const std::shared_ptr &client, object.metadata_offset, object.metadata_size, object.allocated_size, - object.device_num)); + object.device_num, + object.is_experimental_mutable_object)); } std::vector store_fds_as_int; std::vector unique_fd_ids; From 2e677c34c6c3c181444f1c857083d894d064cbb6 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 21:49:28 -0800 Subject: [PATCH 39/66] x Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/protocol.cc | 4 ++++ src/ray/object_manager/test/object_buffer_pool_test.cc | 1 + 2 files changed, 5 insertions(+) diff --git a/src/ray/object_manager/plasma/protocol.cc b/src/ray/object_manager/plasma/protocol.cc index fd880daa7c064..771cb7087cd35 100644 --- 
a/src/ray/object_manager/plasma/protocol.cc +++ b/src/ray/object_manager/plasma/protocol.cc @@ -312,6 +312,8 @@ Status ReadCreateReply(uint8_t *data, object->metadata_offset = message->plasma_object()->metadata_offset(); object->metadata_size = message->plasma_object()->metadata_size(); object->allocated_size = message->plasma_object()->allocated_size(); + object->is_experimental_mutable_object = + message->plasma_object()->is_experimental_mutable_object(); store_fd->first = INT2FD(message->store_fd()); store_fd->second = message->unique_fd_id(); @@ -672,6 +674,8 @@ Status ReadGetReply(uint8_t *data, plasma_objects[i].metadata_size = object->metadata_size(); plasma_objects[i].allocated_size = object->allocated_size(); plasma_objects[i].device_num = object->device_num(); + plasma_objects[i].is_experimental_mutable_object = + object->is_experimental_mutable_object(); } RAY_CHECK(message->store_fds()->size() == message->mmap_sizes()->size()); for (uoffset_t i = 0; i < message->store_fds()->size(); i++) { diff --git a/src/ray/object_manager/test/object_buffer_pool_test.cc b/src/ray/object_manager/test/object_buffer_pool_test.cc index 1ae4602f06acd..c3568d65324e5 100644 --- a/src/ray/object_manager/test/object_buffer_pool_test.cc +++ b/src/ray/object_manager/test/object_buffer_pool_test.cc @@ -53,6 +53,7 @@ class MockPlasmaClient : public plasma::PlasmaClientInterface { ray::Status CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, + bool is_experimental_mutable_object, int64_t data_size, const uint8_t *metadata, int64_t metadata_size, From f06b543f8f5cf7d1c698fbb9361fde3f68f3b8d2 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 09:25:14 -0800 Subject: [PATCH 40/66] cpp test Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/client.h | 4 ++-- .../object_manager/test/object_buffer_pool_test.cc | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/ray/object_manager/plasma/client.h b/src/ray/object_manager/plasma/client.h index bf9b9099f39b5..d50d1b8c5de9e 100644 --- a/src/ray/object_manager/plasma/client.h +++ b/src/ray/object_manager/plasma/client.h @@ -247,6 +247,8 @@ class PlasmaClient : public PlasmaClientInterface { Status ExperimentalMutableObjectWriteRelease(const ObjectID &object_id); + Status ExperimentalMutableObjectReadRelease(const ObjectID &object_id); + /// Create an object in the Plasma Store. Any metadata for this object must be /// be passed in when the object is created. /// @@ -302,8 +304,6 @@ class PlasmaClient : public PlasmaClientInterface { std::vector *object_buffers, bool is_from_worker); - Status ExperimentalMutableObjectReadRelease(const ObjectID &object_id); - /// Tell Plasma that the client no longer needs the object. This should be /// called after Get() or Create() when the client is done with the object. /// After this call, the buffer returned by Get() is no longer valid. 
diff --git a/src/ray/object_manager/test/object_buffer_pool_test.cc b/src/ray/object_manager/test/object_buffer_pool_test.cc index c3568d65324e5..249f8bda3b3a8 100644 --- a/src/ray/object_manager/test/object_buffer_pool_test.cc +++ b/src/ray/object_manager/test/object_buffer_pool_test.cc @@ -51,6 +51,20 @@ class MockPlasmaClient : public plasma::PlasmaClientInterface { MOCK_METHOD1(Abort, ray::Status(const ObjectID &object_id)); + MOCK_METHOD6(ExperimentalMutableObjectWriteAcquire, + ray::Status(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data)); + + MOCK_METHOD1(ExperimentalMutableObjectWriteRelease, + ray::Status(const ObjectID &object_id)); + + MOCK_METHOD1(ExperimentalMutableObjectReadRelease, + ray::Status(const ObjectID &object_id)); + ray::Status CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, bool is_experimental_mutable_object, From 494cb537d0a86e58b8b83bf5f89eaa1b2c41c642 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 09:48:32 -0800 Subject: [PATCH 41/66] Revert "tmp" This reverts commit 93968107d6de00336ca5af23a3c95e9f655ddc89. --- python/ray/_raylet.pyx | 7 ------- python/ray/experimental/channel.py | 4 ---- src/ray/core_worker/core_worker.cc | 4 ---- 3 files changed, 15 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 4d823e36bf0fe..595986851ed7c 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -3507,13 +3507,6 @@ cdef class CoreWorker: c_object_id, )) - def experimental_mutable_object_get_current_output_ref(self, ObjectRef channel_ref): - cdef: - CObjectID c_channel_ref = channel_ref - with nogil: - return (CCoreWorkerProcess.GetCoreWorker() - .ExperimentalMutableObjectGetCurrentOutputRef(c_channel_ref)) - def experimental_mutable_object_read_release(self, object_refs): """ For experimental.channel.Channel. diff --git a/python/ray/experimental/channel.py b/python/ray/experimental/channel.py index 11f5e36053822..e8ef9ad085f79 100644 --- a/python/ray/experimental/channel.py +++ b/python/ray/experimental/channel.py @@ -120,10 +120,6 @@ def write(self, value: Any, num_readers: Optional[int] = None): num_readers, ) - def get_current_output_ref(self) -> "ray.ObjectRef": - return self._worker.core_worker.experimental_mutable_object_get_current_output_ref( - self._base_ref) - def begin_read(self) -> Any: """ Read the latest value from the channel. 
This call will block until a diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 1ac8796e94057..ce1ebe57de8e5 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1307,10 +1307,6 @@ Status CoreWorker::CreateOwnedAndIncrementLocalRef( memory_store_->Put(RayObject(rpc::ErrorType::OBJECT_IN_PLASMA), *object_id)); } } - - if (is_experimental_mutable_object) { - RegisterChannel(*object_id); - } return Status::OK(); } From 05b002fe910b152e744c07f6350a856936653f9c Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 09:49:38 -0800 Subject: [PATCH 42/66] cleanup Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 49 ++++++++++++++------ python/ray/dag/dag_node.py | 31 ++++--------- python/ray/dag/tests/test_accelerated_dag.py | 9 ++-- 3 files changed, 49 insertions(+), 40 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 74ce83c91a917..a85c8b695a006 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -81,17 +81,17 @@ def __init__(self): self.node_idx_to_output_channels = {} # Cached. - self.dag_input_ref = None + self.dag_input_channel = None self.dag_output_channels = None self.worker_task_refs = [] - def add_node(self, node): + def _add_node(self, node): idx = self.counter self.idx_to_task[idx] = CompiledTask(idx, node) self.dag_node_to_idx[node] = idx self.counter += 1 - def preprocess(self): + def _preprocess(self): from ray.dag import DAGNode, InputNode for idx, task in self.idx_to_task.items(): @@ -116,13 +116,13 @@ def preprocess(self): ) self.output_task_idx = idx - def compiled(self): + def _compiled(self): from ray.dag import DAGNode, InputNode, OutputNode, ClassMethodNode - if self.dag_input_ref is not None and self.dag_output_channels is not None: + if self.dag_input_channel is not None and self.dag_output_channels is not None: # Driver should ray.put on input, ray.get/release on output return ( - self.dag_input_ref, + self.dag_input_channel, self.dag_output_channels, ) @@ -177,7 +177,7 @@ def compiled(self): assert arg_buffer is not None resolved_args.append(arg_buffer) - # TODO: Assign the task with the correct input and output buffers. + # Assign the task with the correct input and output buffers. worker_fn = task.dag_node._get_remote_method("__ray_call__") self.worker_task_refs.append( worker_fn.remote( @@ -187,7 +187,7 @@ def compiled(self): ) ) - self.dag_input_ref = self.idx_to_task[self.input_task_idx].output_channel + self.dag_input_channel = self.idx_to_task[self.input_task_idx].output_channel self.dag_output_channels = [] for output in self.idx_to_task[self.output_task_idx].args: @@ -195,19 +195,42 @@ def compiled(self): output_idx = self.dag_node_to_idx[output] self.dag_output_channels.append(self.idx_to_task[output_idx].output_channel) - assert self.dag_input_ref + assert self.dag_input_channel assert self.dag_output_channels # Driver should ray.put on input, ray.get/release on output - return (self.dag_input_ref, self.dag_output_channels) + return (self.dag_input_channel, self.dag_output_channels) + def execute( + self, + *args, + **kwargs, + ) -> List["ray.experimental.channel.Channel"]: + """Execute this DAG using the compiled execution path. -def build_compiled_dag(dag: "ray.dag.DAGNode"): + Args: + args: Args to the InputNode. + kwargs: Kwargs to the InputNode. Not supported yet. + + Returns: + A list of Channels that can be used to read the DAG result. 
+ """ + if len(args) != 1: + raise NotImplementedError("Compiled DAGs support exactly one InputNode arg") + if len(kwargs) != 0: + raise NotImplementedError("Compiled DAGs do not support kwargs") + + input_channel, output_channels = self._compiled() + input_channel.write(args[0]) + return output_channels + + +def build_compiled_dag_from_ray_dag(dag: "ray.dag.DAGNode"): compiled_dag = CompiledDAG() def _build_compiled_dag(node): - compiled_dag.add_node(node) + compiled_dag._add_node(node) return node dag.apply_recursive(_build_compiled_dag) - compiled_dag.preprocess() + compiled_dag._preprocess() return compiled_dag diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 8d8e219f26c50..64a3fd1f1db31 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -16,7 +16,7 @@ import uuid import asyncio -from ray.dag.compiled_dag_node import build_compiled_dag +from ray.dag.compiled_dag_node import build_compiled_dag_from_ray_dag T = TypeVar("T") @@ -107,18 +107,19 @@ async def get_object_refs_from_last_execute(self) -> Dict[str, Any]: def clear_cache(self): self.cache_from_last_execute = {} - def compiled(self) -> Tuple[ray.ObjectRef]: + def experimental_compile(self) -> "ray.dag.CompiledDAG": + """Compile an accelerated execution path for this DAG. The compiled DAG + is cached. + """ if self._compiled_dag is None: - self._compiled_dag = build_compiled_dag(self) + self._compiled_dag = build_compiled_dag_from_ray_dag(self) - return self._compiled_dag.compiled() + return self._compiled_dag def execute( self, *args, _ray_cache_refs: bool = False, - _ray_cache_actors: bool = True, - compiled: bool = False, **kwargs, ) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: """Execute this DAG using the Ray default executor _execute_impl(). @@ -130,31 +131,15 @@ def execute( - Serve handles for class nodes - resolved values representing user input at runtime """ - if compiled: - assert len(args) == 1, "Compiled DAGs support exactly one InputNode arg" - input_ref, output_channels = self.compiled() - input_ref.write(args[0]) - return output_channels def executor(node): return node._execute_impl(*args, **kwargs) - cache = {} + result = self.apply_recursive(executor) if _ray_cache_refs: - cache = self.cache_from_last_execute - elif _ray_cache_actors: - for key, ref in self.cache_from_last_execute.items(): - if isinstance(ref, ray.actor.ActorHandle): - cache[key] = ref - result = self.apply_recursive(executor, cache=cache) - if _ray_cache_refs or _ray_cache_actors: self.cache_from_last_execute = executor.cache return result - def destroy_compiled_dag(self): - _, _, _, monitor = self.compiled() - monitor.destroy() - def _get_toplevel_child_nodes(self) -> List["DAGNode"]: """Return the list of nodes specified as top-level args. 
diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index b07491c1f18c6..9cab0bd70a653 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -3,13 +3,12 @@ import os import sys -import numpy as np import pytest import ray import ray.cluster_utils -import ray.experimental.channel as ray_channel -from ray.dag import DAGNode, InputNode, OutputNode +from ray.dag import InputNode, OutputNode +from ray.tests.conftest import * # noqa logger = logging.getLogger(__name__) @@ -34,8 +33,10 @@ def test_scatter_gather_dag(ray_start_regular, num_actors): out = [a.inc.bind(i) for a in actors] dag = OutputNode(out) + compiled_dag = dag.experimental_compile() + for i in range(3): - output_channels = dag.execute(1, compiled=True) + output_channels = compiled_dag.execute(1) # TODO(swang): Replace with fake ObjectRef. results = [chan.begin_read() for chan in output_channels] assert results == [init_val + i + 1] * num_actors From 521c73b588f01a80e47a55ad3bb16d148453c60c Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 10:07:02 -0800 Subject: [PATCH 43/66] Support no-OutputNode DAGs Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 54 +++++++++++++++----- python/ray/dag/tests/test_accelerated_dag.py | 20 ++++++-- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index a85c8b695a006..fdefb0f0257af 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Tuple, Union import ray import ray.experimental.channel as ray_channel @@ -6,6 +6,8 @@ MAX_BUFFER_SIZE = int(100 * 1e6) # 100MB +ChannelType = "ray.experimental.channel.Channel" + def allocate_channel(buffer_size_bytes: int = MAX_BUFFER_SIZE, num_readers: int = 1): if not isinstance(buffer_size_bytes, int): @@ -52,7 +54,7 @@ def __init__(self, idx, dag_node: "ray.dag.DAGNode"): self.dag_node = dag_node self.args = [] - self.dependent_node_idxs = [] + self.dependent_node_idxs = set() self.output_channel = None @property @@ -78,6 +80,7 @@ def __init__(self): self.input_task_idx = None self.output_task_idx = None + self.has_single_output = False self.node_idx_to_output_channels = {} # Cached. @@ -92,14 +95,22 @@ def _add_node(self, node): self.counter += 1 def _preprocess(self): - from ray.dag import DAGNode, InputNode + """Before compiling, preprocess the DAG to build an index from task to + upstream and downstream tasks, and to set the input and output node(s) + of the DAG. + """ + from ray.dag import DAGNode, InputNode, OutputNode + # For each task node, set its upstream and downstream task nodes. for idx, task in self.idx_to_task.items(): task.args = task.dag_node.get_args() for arg in task.args: if isinstance(arg, DAGNode): arg_idx = self.dag_node_to_idx[arg] - self.idx_to_task[arg_idx].dependent_node_idxs.append(idx) + self.idx_to_task[arg_idx].dependent_node_idxs.add(idx) + + # Find the input node to the DAG. + for idx, task in self.idx_to_task.items(): if isinstance(task.dag_node, InputNode): assert self.input_task_idx is None, "more than one InputNode found" self.input_task_idx = idx @@ -108,6 +119,7 @@ def _preprocess(self): self.input_task_idx is not None ), "no InputNode found, require exactly one" + # Find the (multi-)output node to the DAG. 
for idx, task in self.idx_to_task.items(): if len(task.dependent_node_idxs) == 0: assert self.output_task_idx is None, ( @@ -116,9 +128,26 @@ def _preprocess(self): ) self.output_task_idx = idx - def _compiled(self): + assert self.output_task_idx is not None + output_node = self.idx_to_task[self.output_task_idx].dag_node + # Add an OutputNode to the end of the DAG if it's not already there. + if not isinstance(output_node, OutputNode): + self.has_single_output = True + output_node = OutputNode([output_node]) + self._add_node(output_node) + self.output_task_idx = self.dag_node_to_idx[output_node] + # Preprocess one more time so that we have the right output node + # now. + self.input_task_idx, self.output_task_idx = None, None + self._preprocess() + + def _compiled(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: + """ """ from ray.dag import DAGNode, InputNode, OutputNode, ClassMethodNode + if self.input_task_idx is None: + self._preprocess() + if self.dag_input_channel is not None and self.dag_output_channels is not None: # Driver should ray.put on input, ray.get/release on output return ( @@ -154,11 +183,6 @@ def _compiled(self): for idx in task.dependent_node_idxs: queue.append(idx) - output_node = self.idx_to_task[self.output_task_idx].dag_node - # TODO: Add an OutputNode to the end of the DAG if - # it's not already there. - assert isinstance(output_node, OutputNode) - for node_idx, task in self.idx_to_task.items(): if node_idx == self.input_task_idx: # We don't need to assign an actual task for the input node. @@ -197,6 +221,13 @@ def _compiled(self): assert self.dag_input_channel assert self.dag_output_channels + # If no OutputNode was specified during the DAG creation, there is only + # one output. Return a single output channel instead of a list of + # channels. + if self.has_single_output: + assert len(self.dag_output_channels) == 1 + self.dag_output_channels = self.dag_output_channels[0] + # Driver should ray.put on input, ray.get/release on output return (self.dag_input_channel, self.dag_output_channels) @@ -204,7 +235,7 @@ def execute( self, *args, **kwargs, - ) -> List["ray.experimental.channel.Channel"]: + ) -> Union[ChannelType, List[ChannelType]]: """Execute this DAG using the compiled execution path. Args: @@ -232,5 +263,4 @@ def _build_compiled_dag(node): return node dag.apply_recursive(_build_compiled_dag) - compiled_dag._preprocess() return compiled_dag diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 9cab0bd70a653..3414b5dd4183a 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -25,10 +25,24 @@ def inc(self, x): return self.i +def test_single_output_dag(ray_start_regular): + a = Actor.remote(0) + with InputNode() as i: + dag = a.inc.bind(i) + + compiled_dag = dag.experimental_compile() + + for i in range(3): + output_channel = compiled_dag.execute(1) + # TODO(swang): Replace with fake ObjectRef. 
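+        # begin_read() blocks until the next value is written to the channel, and
+        # end_read() must be called afterwards to release the buffer so that the
+        # writer can write the next value.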
+ result = output_channel.begin_read() + assert result == i + 1 + output_channel.end_read() + + @pytest.mark.parametrize("num_actors", [1, 4]) def test_scatter_gather_dag(ray_start_regular, num_actors): - init_val = 0 - actors = [Actor.remote(init_val) for _ in range(num_actors)] + actors = [Actor.remote(0) for _ in range(num_actors)] with InputNode() as i: out = [a.inc.bind(i) for a in actors] dag = OutputNode(out) @@ -39,7 +53,7 @@ def test_scatter_gather_dag(ray_start_regular, num_actors): output_channels = compiled_dag.execute(1) # TODO(swang): Replace with fake ObjectRef. results = [chan.begin_read() for chan in output_channels] - assert results == [init_val + i + 1] * num_actors + assert results == [i + 1] * num_actors for chan in output_channels: chan.end_read() From 5b58250704f73adc5dd8b0f80a336d29c1decd91 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 10:16:41 -0800 Subject: [PATCH 44/66] Support non-DAG args Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 42 ++++++++++++-------- python/ray/dag/tests/test_accelerated_dag.py | 21 ++++++++++ 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index fdefb0f0257af..b94366b760bd8 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Union +from typing import Any, List, Tuple, Union import ray import ray.experimental.channel as ray_channel @@ -27,18 +27,30 @@ def do_allocate_channel( def do_exec_compiled_task( self, - input_channels: List["ray_channel.Channel"], + inputs: List[Union[Any, "ray_channel.Channel"]], actor_method_name: str, ): try: - self._input_channels = input_channels method = getattr(self, actor_method_name) + + resolved_inputs = [] + input_channel_idxs = [] + # Add placeholders for input channels. + for inp in inputs: + if isinstance(inp, ray_channel.Channel): + input_channel_idxs.append((len(resolved_inputs), inp)) + resolved_inputs.append(None) + else: + resolved_inputs.append(inp) + while True: - inputs = [chan.begin_read() for chan in input_channels] - output_val = method(*inputs) + for idx, chan in input_channel_idxs: + resolved_inputs[idx] = chan.begin_read() + + output_val = method(*resolved_inputs) self._output_channel.write(output_val) - for chan in input_channels: + for _, chan in input_channel_idxs: chan.end_read() except Exception as e: @@ -122,10 +134,7 @@ def _preprocess(self): # Find the (multi-)output node to the DAG. for idx, task in self.idx_to_task.items(): if len(task.dependent_node_idxs) == 0: - assert self.output_task_idx is None, ( - "More than one output node found, " - "make sure only one node has 0 dependent tasks" - ) + assert self.output_task_idx is None, "More than one output node found" self.output_task_idx = idx assert self.output_task_idx is not None @@ -194,12 +203,13 @@ def _compiled(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]] resolved_args = [] for arg in task.args: - # TODO(swang): Support non-ObjectRef args. 
- assert isinstance(arg, DAGNode) - arg_idx = self.dag_node_to_idx[arg] - arg_buffer = self.idx_to_task[arg_idx].output_channel - assert arg_buffer is not None - resolved_args.append(arg_buffer) + if isinstance(arg, DAGNode): + arg_idx = self.dag_node_to_idx[arg] + arg_channel = self.idx_to_task[arg_idx].output_channel + assert arg_channel is not None + resolved_args.append(arg_channel) + else: + resolved_args.append(arg) # Assign the task with the correct input and output buffers. worker_fn = task.dag_node._get_remote_method("__ray_call__") diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 3414b5dd4183a..dabcf57f62973 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -24,6 +24,11 @@ def inc(self, x): self.i += x return self.i + def inc_two(self, x, y): + self.i += x + self.i += y + return self.i + def test_single_output_dag(ray_start_regular): a = Actor.remote(0) @@ -40,6 +45,22 @@ def test_single_output_dag(ray_start_regular): output_channel.end_read() +def test_regular_args(ray_start_regular): + # Test passing regular args to .bind in addition to DAGNode args. + a = Actor.remote(0) + with InputNode() as i: + dag = a.inc_two.bind(2, i) + + compiled_dag = dag.experimental_compile() + + for i in range(3): + output_channel = compiled_dag.execute(1) + # TODO(swang): Replace with fake ObjectRef. + result = output_channel.begin_read() + assert result == (i + 1) * 3 + output_channel.end_read() + + @pytest.mark.parametrize("num_actors", [1, 4]) def test_scatter_gather_dag(ray_start_regular, num_actors): actors = [Actor.remote(0) for _ in range(num_actors)] From b5beca4e73df7b3503efb59ea7f1adb770a4bbde Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 10:46:54 -0800 Subject: [PATCH 45/66] errors Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 64 +++++++++++++++++--- python/ray/dag/tests/test_accelerated_dag.py | 46 ++++++++++++++ 2 files changed, 100 insertions(+), 10 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index b94366b760bd8..3414dceca17c6 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -90,14 +90,16 @@ def __init__(self): # idx counter. self.counter = 0 + # Attributes that are set during preprocessing. + # Preprocessing identifies the input node and output node. self.input_task_idx = None self.output_task_idx = None self.has_single_output = False - self.node_idx_to_output_channels = {} - # Cached. + # Cached attributes that are set during compilation. self.dag_input_channel = None self.dag_output_channels = None + self.node_idx_to_output_channels = {} self.worker_task_refs = [] def _add_node(self, node): @@ -111,10 +113,38 @@ def _preprocess(self): upstream and downstream tasks, and to set the input and output node(s) of the DAG. """ - from ray.dag import DAGNode, InputNode, OutputNode + from ray.dag import ( + DAGNode, + ClassMethodNode, + FunctionNode, + InputAttributeNode, + InputNode, + OutputNode, + ) # For each task node, set its upstream and downstream task nodes. for idx, task in self.idx_to_task.items(): + dag_node = task.dag_node + if not ( + isinstance(dag_node, InputNode) + or isinstance(dag_node, OutputNode) + or isinstance(dag_node, ClassMethodNode) + ): + if isinstance(dag_node, InputAttributeNode): + # TODO(swang): Support multi args. 
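+                    # InputAttributeNode is produced by indexing or attribute access
+                    # on the InputNode (e.g. inp[0] or inp.key), which implies
+                    # multiple args or kwargs at the DAG input.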
+ raise ValueError( + "Compiled DAGs currently do not support kwargs or multiple args for InputNode" + ) + elif isinstance(dag_node, FunctionNode): + # TODO(swang): Support non-actor tasks. + raise ValueError( + "Compiled DAGs currently only support actor method nodes" + ) + else: + raise ValueError( + f"Found unsupported node of type {type(task.dag_node)}" + ) + task.args = task.dag_node.get_args() for arg in task.args: if isinstance(arg, DAGNode): @@ -127,9 +157,8 @@ def _preprocess(self): assert self.input_task_idx is None, "more than one InputNode found" self.input_task_idx = idx # TODO: Support no-input DAGs (use an empty object to signal). - assert ( - self.input_task_idx is not None - ), "no InputNode found, require exactly one" + if self.input_task_idx is None: + raise ValueError("Compiled DAGs currently require exactly one InputNode") # Find the (multi-)output node to the DAG. for idx, task in self.idx_to_task.items(): @@ -150,7 +179,7 @@ def _preprocess(self): self.input_task_idx, self.output_task_idx = None, None self._preprocess() - def _compiled(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: + def _compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: """ """ from ray.dag import DAGNode, InputNode, OutputNode, ClassMethodNode @@ -202,14 +231,23 @@ def _compiled(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]] continue resolved_args = [] + has_at_least_one_channel_input = False for arg in task.args: if isinstance(arg, DAGNode): arg_idx = self.dag_node_to_idx[arg] arg_channel = self.idx_to_task[arg_idx].output_channel assert arg_channel is not None resolved_args.append(arg_channel) + has_at_least_one_channel_input = True else: resolved_args.append(arg) + # TODO: Support no-input DAGs (use an empty object to signal). + if not has_at_least_one_channel_input: + raise ValueError( + "Compiled DAGs require each task to take a " + "ray.dag.InputNode or at least one other DAGNode as an " + "input" + ) # Assign the task with the correct input and output buffers. worker_fn = task.dag_node._get_remote_method("__ray_call__") @@ -231,6 +269,9 @@ def _compiled(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]] assert self.dag_input_channel assert self.dag_output_channels + assert [ + output_channel is not None for output_channel in self.dag_output_channels + ] # If no OutputNode was specified during the DAG creation, there is only # one output. Return a single output channel instead of a list of # channels. @@ -255,12 +296,14 @@ def execute( Returns: A list of Channels that can be used to read the DAG result. """ + # These errors should already be caught during compilation, but just in + # case. 
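+        # For illustration (assumed call shapes), given a compiled DAG `compiled_dag`:
+        #   compiled_dag.execute(x)        # OK: exactly one positional InputNode arg
+        #   compiled_dag.execute(x, y)     # rejected: multiple args not supported yet
+        #   compiled_dag.execute(inp=x)    # rejected: kwargs not supported yet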
if len(args) != 1: - raise NotImplementedError("Compiled DAGs support exactly one InputNode arg") + raise ValueError("Compiled DAGs support exactly one InputNode arg") if len(kwargs) != 0: - raise NotImplementedError("Compiled DAGs do not support kwargs") + raise ValueError("Compiled DAGs do not support kwargs") - input_channel, output_channels = self._compiled() + input_channel, output_channels = self._compile() input_channel.write(args[0]) return output_channels @@ -273,4 +316,5 @@ def _build_compiled_dag(node): return node dag.apply_recursive(_build_compiled_dag) + compiled_dag._compile() return compiled_dag diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index dabcf57f62973..405cd2876e84c 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -79,6 +79,52 @@ def test_scatter_gather_dag(ray_start_regular, num_actors): chan.end_read() +def test_dag_errors(ray_start_regular): + a = Actor.remote(0) + dag = a.inc.bind(1) + with pytest.raises( + ValueError, match="Compiled DAGs currently require exactly one InputNode" + ): + dag.experimental_compile() + + a2 = Actor.remote(0) + with InputNode() as inp: + dag = OutputNode([a.inc.bind(inp), a2.inc.bind(1)]) + with pytest.raises( + ValueError, + match="Compiled DAGs require each task to take a ray.dag.InputNode or " + "at least one other DAGNode as an input", + ): + dag.experimental_compile() + + @ray.remote + def f(x): + return x + + with InputNode() as inp: + dag = f.bind(inp) + with pytest.raises( + ValueError, match="Compiled DAGs currently only support actor method nodes" + ): + dag.experimental_compile() + + with InputNode() as inp: + dag = a.inc_two.bind(inp[0], inp[1]) + with pytest.raises( + ValueError, + match="Compiled DAGs currently do not support kwargs or multiple args for InputNode", + ): + dag.experimental_compile() + + with InputNode() as inp: + dag = a.inc_two.bind(inp.x, inp.y) + with pytest.raises( + ValueError, + match="Compiled DAGs currently do not support kwargs or multiple args for InputNode", + ): + dag.experimental_compile() + + if __name__ == "__main__": if os.environ.get("PARALLEL_CI"): sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) From cc2e795c6cf037499e503f2f7ed36634ab619bbe Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 10:47:39 -0800 Subject: [PATCH 46/66] lint Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 3 ++- python/ray/dag/tests/test_accelerated_dag.py | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 3414dceca17c6..8dc6d0db9474d 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -133,7 +133,8 @@ def _preprocess(self): if isinstance(dag_node, InputAttributeNode): # TODO(swang): Support multi args. raise ValueError( - "Compiled DAGs currently do not support kwargs or multiple args for InputNode" + "Compiled DAGs currently do not support kwargs or " + "multiple args for InputNode" ) elif isinstance(dag_node, FunctionNode): # TODO(swang): Support non-actor tasks. 
diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 405cd2876e84c..4702ee45580f1 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -112,7 +112,8 @@ def f(x): dag = a.inc_two.bind(inp[0], inp[1]) with pytest.raises( ValueError, - match="Compiled DAGs currently do not support kwargs or multiple args for InputNode", + match="Compiled DAGs currently do not support kwargs or multiple args " + "for InputNode", ): dag.experimental_compile() @@ -120,7 +121,8 @@ def f(x): dag = a.inc_two.bind(inp.x, inp.y) with pytest.raises( ValueError, - match="Compiled DAGs currently do not support kwargs or multiple args for InputNode", + match="Compiled DAGs currently do not support kwargs or multiple args " + "for InputNode", ): dag.experimental_compile() From c17c3671e2278e993f5b5239c3bef3ba135acfde Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 10:50:15 -0800 Subject: [PATCH 47/66] doc Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 8dc6d0db9474d..35890b3367939 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -181,7 +181,17 @@ def _preprocess(self): self._preprocess() def _compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: - """ """ + """Compile an execution path. This allocates channels for adjacent + tasks to send/receive values. An infinite task is submitted to each + actor in the DAG that repeatedly receives from input channel(s) and + sends to output channel(s). + + Returns: + A tuple of (input channel, output channel(s)). The input channel + that should be used by the caller to submit a DAG execution. The + output channel(s) should be read by the caller to get the DAG + output. 
+ """ from ray.dag import DAGNode, InputNode, OutputNode, ClassMethodNode if self.input_task_idx is None: From 4dfa31eda5346b74d5e7b1e774eab2a193e1d148 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 10:52:06 -0800 Subject: [PATCH 48/66] skip tests on windows Signed-off-by: Stephanie Wang --- python/ray/tests/test_channel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/ray/tests/test_channel.py b/python/ray/tests/test_channel.py index 0bd008593a110..3ff33df76bc5d 100644 --- a/python/ray/tests/test_channel.py +++ b/python/ray/tests/test_channel.py @@ -13,6 +13,7 @@ logger = logging.getLogger(__name__) +@pytest.mark.skipif(sys.platform == "win32", reason="Requires POSIX.") def test_put_local_get(ray_start_regular): chan = ray_channel.Channel(1000) @@ -28,6 +29,7 @@ def test_put_local_get(ray_start_regular): chan.end_read() +@pytest.mark.skipif(sys.platform == "win32", reason="Requires POSIX.") def test_errors(ray_start_regular): @ray.remote class Actor: @@ -72,6 +74,7 @@ def read(self, chan): assert "ray.exceptions.RaySystemError" in str(exc_info.value) +@pytest.mark.skipif(sys.platform == "win32", reason="Requires POSIX.") def test_put_different_meta(ray_start_regular): chan = ray_channel.Channel(1000) @@ -97,6 +100,7 @@ def _test(val): _test(np.random.rand(1)) +@pytest.mark.skipif(sys.platform == "win32", reason="Requires POSIX.") @pytest.mark.parametrize("num_readers", [1, 4]) def test_put_remote_get(ray_start_regular, num_readers): chan = ray_channel.Channel(1000) From 03f4fbd1a4e4b156b384441ce39e07f87a43c69e Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 8 Dec 2023 09:24:25 -0800 Subject: [PATCH 49/66] larger CI machine Signed-off-by: Stephanie Wang --- .buildkite/core.rayci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/core.rayci.yml b/.buildkite/core.rayci.yml index d8ddef502e1f7..c4beb251b10c1 100644 --- a/.buildkite/core.rayci.yml +++ b/.buildkite/core.rayci.yml @@ -210,7 +210,7 @@ steps: - label: ":ray: core: cpp ubsan tests" tags: core_cpp - instance_type: medium + instance_type: large commands: - bazel run //ci/ray_ci:test_in_docker -- //:all //src/... 
core --build-type ubsan --except-tags no_ubsan From 7dde158bffc5bbdee778837d9277692e0284d189 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 8 Dec 2023 17:35:00 -0800 Subject: [PATCH 50/66] cleanup Signed-off-by: Stephanie Wang --- python/ray/dag/class_node.py | 9 +- python/ray/dag/compiled_dag_node.py | 119 ++++++++++++++----- python/ray/dag/dag_node.py | 2 +- python/ray/dag/tests/test_accelerated_dag.py | 43 +++++++ python/ray/experimental/channel.py | 33 +++-- 5 files changed, 159 insertions(+), 47 deletions(-) diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index 8daf406dd2e1c..9e0e1c3596a22 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -7,7 +7,7 @@ from ray.dag.constants import PARENT_CLASS_NODE_KEY from ray.util.annotations import DeveloperAPI -from typing import Any, Dict, List, Union, Tuple +from typing import Any, Dict, List, Optional, Union, Tuple @DeveloperAPI @@ -146,7 +146,7 @@ def __init__( self._method_name: str = method_name # Parse other_args_to_resolve and assign to variables self._parent_class_node: Union[ - ClassNode, ReferenceType["ray._private.actor.ActorHandle"] + ClassNode, ReferenceType["ray.actor.ActorHandle"] ] = other_args_to_resolve.get(PARENT_CLASS_NODE_KEY) # The actor creation task dependency is encoded as the first argument, # and the ordering dependency as the second, which ensures they are @@ -197,3 +197,8 @@ def get_method_name(self) -> str: def _get_remote_method(self, method_name): method_body = getattr(self._parent_class_node, method_name) return method_body + + def _get_actor_handle(self) -> Optional[ReferenceType["ray.actor.ActorHandle"]]: + if not isinstance(self._parent_class_node, ray.actor.ActorHandle): + return None + return self._parent_class_node diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 35890b3367939..17f6362b6bfd7 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -1,5 +1,8 @@ +import logging from typing import Any, List, Tuple, Union +from collections import defaultdict + import ray import ray.experimental.channel as ray_channel @@ -8,70 +11,93 @@ ChannelType = "ray.experimental.channel.Channel" - -def allocate_channel(buffer_size_bytes: int = MAX_BUFFER_SIZE, num_readers: int = 1): - if not isinstance(buffer_size_bytes, int): - raise ValueError("buffer_size_bytes must be an integer") - if not isinstance(num_readers, int): - raise ValueError("num_readers must be an integer") - - return ray_channel.Channel(buffer_size_bytes, num_readers) +logger = logging.getLogger(__name__) def do_allocate_channel( self, buffer_size_bytes: int = MAX_BUFFER_SIZE, num_readers: int = 1 -): - self._output_channel = allocate_channel(buffer_size_bytes) +) -> ChannelType: + """Generic actor method to allocate an output channel. + + Args: + buffer_size_bytes: The maximum size of messages in the channel. + num_readers: The number of readers per message. + + Returns: + The allocated channel. + """ + self._output_channel = ray_channel.Channel(buffer_size_bytes, num_readers) return self._output_channel def do_exec_compiled_task( self, - inputs: List[Union[Any, "ray_channel.Channel"]], + inputs: List[Union[Any, ChannelType]], actor_method_name: str, -): +) -> None: + """Generic actor method to begin executing a compiled DAG. This runs an + infinite loop to repeatedly read input channel(s), execute the given + method, and write output channel(s). It only exits if the actor dies or an + exception is thrown. 
+ + Args: + inputs: The arguments to the task. Arguments that are not Channels will + get passed through to the actor method. If the argument is a channel, + it will be replaced by the value read from the channel before the + method execute. + actor_method_name: The name of the actual actor method to execute in + the loop. + """ try: method = getattr(self, actor_method_name) resolved_inputs = [] input_channel_idxs = [] # Add placeholders for input channels. - for inp in inputs: + for idx, inp in enumerate(inputs): if isinstance(inp, ray_channel.Channel): - input_channel_idxs.append((len(resolved_inputs), inp)) + input_channel_idxs.append((idx, inp)) resolved_inputs.append(None) else: resolved_inputs.append(inp) while True: - for idx, chan in input_channel_idxs: - resolved_inputs[idx] = chan.begin_read() + for idx, channel in input_channel_idxs: + resolved_inputs[idx] = channel.begin_read() output_val = method(*resolved_inputs) self._output_channel.write(output_val) - for _, chan in input_channel_idxs: - chan.end_read() + for _, channel in input_channel_idxs: + channel.end_read() except Exception as e: - print("Task aborted", e) + logging.warn(f"Compiled DAG task aborted with exception: {e}") raise class CompiledTask: """Wraps the normal Ray DAGNode with some metadata.""" - def __init__(self, idx, dag_node: "ray.dag.DAGNode"): + def __init__(self, idx: int, dag_node: "ray.dag.DAGNode"): + """ + Args: + idx: A unique index into the original DAG. + dag_node: The original DAG node created by the user. + """ self.idx = idx self.dag_node = dag_node - self.args = [] - self.dependent_node_idxs = set() + self.downstream_node_idxs = set() self.output_channel = None + @property + def args(self): + return self.dag_node.get_args() + @property def num_readers(self): - return len(self.dependent_node_idxs) + return len(self.downstream_node_idxs) def __str__(self): return f""" @@ -95,23 +121,27 @@ def __init__(self): self.input_task_idx = None self.output_task_idx = None self.has_single_output = False + self.actor_task_count = defaultdict(int) # Cached attributes that are set during compilation. self.dag_input_channel = None self.dag_output_channels = None - self.node_idx_to_output_channels = {} + # ObjectRef for each worker's task. The task is an infinite loop that + # repeatedly executes the method specified in the DAG. self.worker_task_refs = [] - def _add_node(self, node): + def _add_node(self, node: "ray.dag.DAGNode") -> None: idx = self.counter self.idx_to_task[idx] = CompiledTask(idx, node) self.dag_node_to_idx[node] = idx self.counter += 1 - def _preprocess(self): + def _preprocess(self) -> None: """Before compiling, preprocess the DAG to build an index from task to upstream and downstream tasks, and to set the input and output node(s) of the DAG. + + This function is idempotent. """ from ray.dag import ( DAGNode, @@ -122,6 +152,9 @@ def _preprocess(self): OutputNode, ) + self.input_task_idx, self.output_task_idx = None, None + self.actor_task_count.clear() + # For each task node, set its upstream and downstream task nodes. 
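+        # Illustrative example: for a chain DAG  inp -> a.inc -> b.inc, after this
+        # pass the task for `inp` has downstream_node_idxs == {a.inc}, `a.inc` has
+        # {b.inc}, and `b.inc` has an empty set, which marks it as the DAG's output
+        # task below.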
for idx, task in self.idx_to_task.items(): dag_node = task.dag_node @@ -146,11 +179,26 @@ def _preprocess(self): f"Found unsupported node of type {type(task.dag_node)}" ) - task.args = task.dag_node.get_args() + if isinstance(dag_node, ClassMethodNode): + actor_handle = dag_node._get_actor_handle() + if actor_handle is None: + raise ValueError( + "Compiled DAGs can only bind methods to an actor " + "that is already created with Actor.remote()" + ) + self.actor_task_count[actor_handle._actor_id] += 1 + for arg in task.args: if isinstance(arg, DAGNode): arg_idx = self.dag_node_to_idx[arg] - self.idx_to_task[arg_idx].dependent_node_idxs.add(idx) + self.idx_to_task[arg_idx].downstream_node_idxs.add(idx) + + for actor_id, task_count in self.actor_task_count.items(): + if task_count > 1: + raise ValueError( + "Compiled DAGs can contain at most one task per actor handle. " + f"Actor with ID {actor_id} appears {task_count}x." + ) # Find the input node to the DAG. for idx, task in self.idx_to_task.items(): @@ -163,7 +211,7 @@ def _preprocess(self): # Find the (multi-)output node to the DAG. for idx, task in self.idx_to_task.items(): - if len(task.dependent_node_idxs) == 0: + if len(task.downstream_node_idxs) == 0: assert self.output_task_idx is None, "More than one output node found" self.output_task_idx = idx @@ -177,7 +225,6 @@ def _preprocess(self): self.output_task_idx = self.dag_node_to_idx[output_node] # Preprocess one more time so that we have the right output node # now. - self.input_task_idx, self.output_task_idx = None, None self._preprocess() def _compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: @@ -186,6 +233,9 @@ def _compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: actor in the DAG that repeatedly receives from input channel(s) and sends to output channel(s). + This function is idempotent and will cache the previously allocated + channels. + Returns: A tuple of (input channel, output channel(s)). The input channel that should be used by the caller to submit a DAG execution. The @@ -197,7 +247,8 @@ def _compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: if self.input_task_idx is None: self._preprocess() - if self.dag_input_channel is not None and self.dag_output_channels is not None: + if self.dag_input_channel is not None: + assert self.dag_output_channels is not None # Driver should ray.put on input, ray.get/release on output return ( self.dag_input_channel, @@ -225,11 +276,13 @@ def _compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: ) ) elif isinstance(task.dag_node, InputNode): - task.output_channel = allocate_channel(num_readers=task.num_readers) + task.output_channel = ray_channel.Channel( + buffer_size_bytes=MAX_BUFFER_SIZE, num_readers=task.num_readers + ) else: assert isinstance(task.dag_node, OutputNode) - for idx in task.dependent_node_idxs: + for idx in task.downstream_node_idxs: queue.append(idx) for node_idx, task in self.idx_to_task.items(): diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 64a3fd1f1db31..fa029457fa73c 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -234,7 +234,7 @@ def _apply_and_replace_all_child_nodes( new_args, new_kwargs, self.get_options(), new_other_args_to_resolve ) - def apply_recursive(self, fn: "Callable[[DAGNode], T]", cache=None) -> T: + def apply_recursive(self, fn: "Callable[[DAGNode], T]") -> T: """Apply callable on each node in this DAG in a bottom-up tree walk. 
Args: diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 4702ee45580f1..49aecd182be7f 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -9,6 +9,7 @@ import ray.cluster_utils from ray.dag import InputNode, OutputNode from ray.tests.conftest import * # noqa +from ray._private.test_utils import wait_for_condition logger = logging.getLogger(__name__) @@ -24,6 +25,10 @@ def inc(self, x): self.i += x return self.i + def append_to(self, lst): + lst.append(self.i) + return lst + def inc_two(self, x, y): self.i += x self.i += y @@ -79,6 +84,36 @@ def test_scatter_gather_dag(ray_start_regular, num_actors): chan.end_read() +@pytest.mark.parametrize("num_actors", [1, 4]) +def test_chain_dag(ray_start_regular, num_actors): + actors = [Actor.remote(i) for i in range(num_actors)] + with InputNode() as inp: + dag = inp + for a in actors: + dag = a.append_to.bind(dag) + + compiled_dag = dag.experimental_compile() + + for i in range(3): + output_channel = compiled_dag.execute([]) + # TODO(swang): Replace with fake ObjectRef. + result = output_channel.begin_read() + assert result == list(range(num_actors)) + output_channel.end_read() + + +def test_dag_exception(ray_start_regular, capsys): + a = Actor.remote(0) + with InputNode() as inp: + dag = a.inc.bind(inp) + + compiled_dag = dag.experimental_compile() + output_channel = compiled_dag.execute("hello") + wait_for_condition( + lambda: "Compiled DAG task aborted with exception" in capsys.readouterr().err + ) + + def test_dag_errors(ray_start_regular): a = Actor.remote(0) dag = a.inc.bind(1) @@ -108,6 +143,14 @@ def f(x): ): dag.experimental_compile() + with InputNode() as inp: + dag = a.inc.bind(inp) + dag = a.inc.bind(dag) + with pytest.raises( + ValueError, match="Compiled DAGs can contain at most one task per actor handle." + ): + dag.experimental_compile() + with InputNode() as inp: dag = a.inc_two.bind(inp[0], inp[1]) with pytest.raises( diff --git a/python/ray/experimental/channel.py b/python/ray/experimental/channel.py index e8ef9ad085f79..310fa870c0fd1 100644 --- a/python/ray/experimental/channel.py +++ b/python/ray/experimental/channel.py @@ -12,7 +12,7 @@ def _create_channel_ref( - buffer_size: int, + buffer_size_bytes: int, ) -> "ray.ObjectRef": """ Create a channel that can be read and written by co-located Ray processes. @@ -21,7 +21,7 @@ def _create_channel_ref( read the previous value. Only the channel creator may write to the channel. Args: - buffer_size: The number of bytes to allocate for the object data and + buffer_size_bytes: The number of bytes to allocate for the object data and metadata. Writes to the channel must produce serialized data and metadata less than or equal to this value. Returns: @@ -30,7 +30,7 @@ def _create_channel_ref( worker = ray._private.worker.global_worker worker.check_connected() - value = b"0" * buffer_size + value = b"0" * buffer_size_bytes try: object_ref = worker.put_object( @@ -52,7 +52,12 @@ class Channel: ray.wait. """ - def __init__(self, buffer_size: Optional[int] = None, num_readers: int = 1): + def __init__( + self, + buffer_size_bytes: Optional[int] = None, + num_readers: int = 1, + _base_ref: Optional["ray.ObjectRef"] = None, + ): """ Create a channel that can be read and written by co-located Ray processes. @@ -60,26 +65,32 @@ def __init__(self, buffer_size: Optional[int] = None, num_readers: int = 1): so the writer will block until reader(s) have read the previous value. 
Args: - buffer_size: The number of bytes to allocate for the object data and + buffer_size_bytes: The number of bytes to allocate for the object data and metadata. Writes to the channel must produce serialized data and metadata less than or equal to this value. Returns: Channel: A wrapper around ray.ObjectRef. """ - if buffer_size is None: - self._base_ref = None + if buffer_size_bytes is None: + if _base_ref is None: + raise ValueError( + "One of `buffer_size_bytes` or `_base_ref` must be provided" + ) + self._base_ref = _base_ref else: - self._base_ref = _create_channel_ref(buffer_size) + if not isinstance(buffer_size_bytes, int): + raise ValueError("buffer_size_bytes must be an integer") + self._base_ref = _create_channel_ref(buffer_size_bytes) + if not isinstance(num_readers, int): + raise ValueError("num_readers must be an integer") self._num_readers = num_readers self._worker = ray._private.worker.global_worker self._worker.check_connected() @staticmethod def _from_base_ref(base_ref: "ray.ObjectRef", num_readers: int) -> "Channel": - chan = Channel(num_readers=num_readers) - chan._base_ref = base_ref - return chan + return Channel(num_readers=num_readers, _base_ref=base_ref) def __reduce__(self): return self._from_base_ref, (self._base_ref, self._num_readers) From 63cc16d55f2b37ba4e13f67a68d85bcbf7f2c1d2 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 8 Dec 2023 17:35:11 -0800 Subject: [PATCH 51/66] cleanup Signed-off-by: Stephanie Wang --- python/ray/dag/tests/test_accelerated_dag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 49aecd182be7f..4435235b52d05 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -108,7 +108,7 @@ def test_dag_exception(ray_start_regular, capsys): dag = a.inc.bind(inp) compiled_dag = dag.experimental_compile() - output_channel = compiled_dag.execute("hello") + compiled_dag.execute("hello") wait_for_condition( lambda: "Compiled DAG task aborted with exception" in capsys.readouterr().err ) From dca12399b057c01a4fb25307a2b43d401ba0fc7e Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 8 Dec 2023 17:50:31 -0800 Subject: [PATCH 52/66] perf Signed-off-by: Stephanie Wang --- python/ray/_private/ray_perf.py | 46 +++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py index 330527957d675..99401241ceb22 100644 --- a/python/ray/_private/ray_perf.py +++ b/python/ray/_private/ray_perf.py @@ -9,6 +9,7 @@ import ray import ray.experimental.channel as ray_channel +from ray.dag import InputNode, OutputNode logger = logging.getLogger(__name__) @@ -369,6 +370,51 @@ def read(self, chans): for reader in readers: ray.kill(reader) + # Tests for compiled DAGs. 
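+    # A rough sketch (illustrative only) of the pattern benchmarked below: each
+    # execute() call writes its argument to the compiled DAG's input channel and
+    # returns the output channel(s); the caller must begin_read()/end_read() each
+    # output channel to release the buffer before the next execute(). For example:
+    #
+    #     with InputNode() as inp:
+    #         dag = a.echo.bind(inp)
+    #     compiled = dag.experimental_compile()
+    #     chan = compiled.execute(b"x")
+    #     assert chan.begin_read() == b"x"
+    #     chan.end_read()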
+ + def _exec(dag): + output_channel = dag.execute(b"x") + output_channel.begin_read() + output_channel.end_read() + + def _exec_multi_output(dag): + output_channels = dag.execute(b"x") + for output_channel in output_channels: + output_channel.begin_read() + for output_channel in output_channels: + output_channel.end_read() + + @ray.remote + class Actor: + def echo(self, x): + return x + + a = Actor.remote() + with InputNode() as inp: + dag = a.echo.bind(inp) + + dag = dag.experimental_compile() + results += timeit("compiled single-actor DAG calls", lambda: _exec(dag)) + + del a + n_cpu = multiprocessing.cpu_count() // 2 + actors = [Actor.remote() for _ in range(n_cpu)] + with InputNode() as inp: + dag = OutputNode([a.echo.bind(inp) for a in actors]) + dag = dag.experimental_compile() + results += timeit( + f"compiled scatter-gather DAG calls, n={n_cpu} actors", + lambda: _exec_multi_output(dag), + ) + + actors = [Actor.remote() for _ in range(n_cpu)] + with InputNode() as inp: + dag = inp + for a in actors: + dag = a.echo.bind(dag) + dag = dag.experimental_compile() + results += timeit(f"compiled chain DAG calls, n={n_cpu} actors", lambda: _exec(dag)) + ray.shutdown() ############################ From 7b8472b70438a0d1c45b8a47817ef0afc50ec45f Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Sat, 9 Dec 2023 11:28:42 -0800 Subject: [PATCH 53/66] add normal DAG Signed-off-by: Stephanie Wang --- python/ray/_private/ray_perf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py index 99401241ceb22..73d1f2afb0acd 100644 --- a/python/ray/_private/ray_perf.py +++ b/python/ray/_private/ray_perf.py @@ -393,6 +393,7 @@ def echo(self, x): with InputNode() as inp: dag = a.echo.bind(inp) + results += timeit("single-actor DAG calls", lambda: ray.get(dag.execute(b"x"))) dag = dag.experimental_compile() results += timeit("compiled single-actor DAG calls", lambda: _exec(dag)) @@ -401,6 +402,7 @@ def echo(self, x): actors = [Actor.remote() for _ in range(n_cpu)] with InputNode() as inp: dag = OutputNode([a.echo.bind(inp) for a in actors]) + results += timeit("scatter-gather DAG calls", lambda: ray.get(dag.execute(b"x"))) dag = dag.experimental_compile() results += timeit( f"compiled scatter-gather DAG calls, n={n_cpu} actors", @@ -412,6 +414,9 @@ def echo(self, x): dag = inp for a in actors: dag = a.echo.bind(dag) + results += timeit( + f"chain DAG calls, n={n_cpu} actors", lambda: ray.get(dag.execute(b"x")) + ) dag = dag.experimental_compile() results += timeit(f"compiled chain DAG calls, n={n_cpu} actors", lambda: _exec(dag)) From 740169bf761576be79bc412bed621615eea622ef Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Sat, 9 Dec 2023 11:39:05 -0800 Subject: [PATCH 54/66] x Signed-off-by: Stephanie Wang --- python/ray/_private/ray_perf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py index 73d1f2afb0acd..8d1a56f09cc98 100644 --- a/python/ray/_private/ray_perf.py +++ b/python/ray/_private/ray_perf.py @@ -402,7 +402,9 @@ def echo(self, x): actors = [Actor.remote() for _ in range(n_cpu)] with InputNode() as inp: dag = OutputNode([a.echo.bind(inp) for a in actors]) - results += timeit("scatter-gather DAG calls", lambda: ray.get(dag.execute(b"x"))) + results += timeit( + "scatter-gather DAG calls, n={n_cpu} actors", lambda: ray.get(dag.execute(b"x")) + ) dag = dag.experimental_compile() results += timeit( f"compiled scatter-gather DAG calls, 
n={n_cpu} actors", From 905a5bc4f01c62c8ededaa81abfe4318b1da0653 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 10:29:57 -0800 Subject: [PATCH 55/66] merge Signed-off-by: Stephanie Wang --- python/ray/_private/ray_perf.py | 4 +- python/ray/dag/constants.py | 1 + python/ray/dag/dag_node.py | 7 -- python/ray/dag/tests/test_accelerated_dag.py | 8 +- python/ray/dag/tests/test_accelerator_dag.py | 122 ------------------- 5 files changed, 7 insertions(+), 135 deletions(-) delete mode 100644 python/ray/dag/tests/test_accelerator_dag.py diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py index 8d1a56f09cc98..d1c1e7e1abf6c 100644 --- a/python/ray/_private/ray_perf.py +++ b/python/ray/_private/ray_perf.py @@ -9,7 +9,7 @@ import ray import ray.experimental.channel as ray_channel -from ray.dag import InputNode, OutputNode +from ray.dag import InputNode, MultiOutputNode logger = logging.getLogger(__name__) @@ -401,7 +401,7 @@ def echo(self, x): n_cpu = multiprocessing.cpu_count() // 2 actors = [Actor.remote() for _ in range(n_cpu)] with InputNode() as inp: - dag = OutputNode([a.echo.bind(inp) for a in actors]) + dag = MultiOutputNode([a.echo.bind(inp) for a in actors]) results += timeit( "scatter-gather DAG calls, n={n_cpu} actors", lambda: ray.get(dag.execute(b"x")) ) diff --git a/python/ray/dag/constants.py b/python/ray/dag/constants.py index 77ccb6cc35b78..d2d309d56bdaa 100644 --- a/python/ray/dag/constants.py +++ b/python/ray/dag/constants.py @@ -1,5 +1,6 @@ # Reserved keys used to handle ClassMethodNode in Ray DAG building. PARENT_CLASS_NODE_KEY = "parent_class_node" +PREV_CLASS_METHOD_CALL_KEY = "prev_class_method_call" # Reserved key to distinguish DAGNode type and avoid collision with user dict. DAGNODE_TYPE_KEY = "__dag_node_type__" diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 81dbd2b3d128f..6d15889eea8c2 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -117,14 +117,7 @@ def experimental_compile(self) -> "ray.dag.CompiledDAG": return self._compiled_dag def execute( -<<<<<<< HEAD - self, - *args, - _ray_cache_refs: bool = False, - **kwargs, -======= self, *args, _ray_cache_refs: bool = False, **kwargs ->>>>>>> 1a090a0f13492fbaa0561514488bf9b3638af6af ) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: """Execute this DAG using the Ray default executor _execute_impl(). 
diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 4435235b52d05..13a90ceef3b13 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -7,7 +7,7 @@ import ray import ray.cluster_utils -from ray.dag import InputNode, OutputNode +from ray.dag import InputNode, MultiOutputNode from ray.tests.conftest import * # noqa from ray._private.test_utils import wait_for_condition @@ -35,7 +35,7 @@ def inc_two(self, x, y): return self.i -def test_single_output_dag(ray_start_regular): +def test_basic(ray_start_regular): a = Actor.remote(0) with InputNode() as i: dag = a.inc.bind(i) @@ -71,7 +71,7 @@ def test_scatter_gather_dag(ray_start_regular, num_actors): actors = [Actor.remote(0) for _ in range(num_actors)] with InputNode() as i: out = [a.inc.bind(i) for a in actors] - dag = OutputNode(out) + dag = MultiOutputNode(out) compiled_dag = dag.experimental_compile() @@ -124,7 +124,7 @@ def test_dag_errors(ray_start_regular): a2 = Actor.remote(0) with InputNode() as inp: - dag = OutputNode([a.inc.bind(inp), a2.inc.bind(1)]) + dag = MultiOutputNode([a.inc.bind(inp), a2.inc.bind(1)]) with pytest.raises( ValueError, match="Compiled DAGs require each task to take a ray.dag.InputNode or " diff --git a/python/ray/dag/tests/test_accelerator_dag.py b/python/ray/dag/tests/test_accelerator_dag.py deleted file mode 100644 index 7114a4f0f0ac7..0000000000000 --- a/python/ray/dag/tests/test_accelerator_dag.py +++ /dev/null @@ -1,122 +0,0 @@ -import pytest - -import ray -from ray.dag.input_node import InputNode -from ray.dag.output_node import OutputNode - - -def test_output_node(shared_ray_instance): - @ray.remote - def f(input): - return input - - with pytest.raises(ValueError): - with InputNode() as input_data: - dag = OutputNode(f.bind(input_data)) - - with InputNode() as input_data: - dag = OutputNode([f.bind(input_data)]) - - assert ray.get(dag.execute(1)) == [1] - assert ray.get(dag.execute(2)) == [2] - - with InputNode() as input_data: - dag = OutputNode([f.bind(input_data["x"]), f.bind(input_data["y"])]) - - refs = dag.execute({"x": 1, "y": 2}) - assert len(refs) == 2 - assert ray.get(refs) == [1, 2] - - with InputNode() as input_data: - dag = OutputNode( - [f.bind(input_data["x"]), f.bind(input_data["y"]), f.bind(input_data["x"])] - ) - - refs = dag.execute({"x": 1, "y": 2}) - assert len(refs) == 3 - assert ray.get(refs) == [1, 2, 1] - - -def test_dag_with_actor_handle(shared_ray_instance): - """Verify DAG API works with actor created by .remote""" - - @ray.remote - class Worker: - def __init__(self): - self.forward_called = 0 - self.init_called = 0 - - def forward(self, input): - print("forward") - self.forward_called += 1 - return input - - def initialize(self, input): - print("initialize") - self.init_called += 1 - return input - - def get(self): - return (self.forward_called, self.init_called) - - worker = Worker.remote() - with InputNode() as input_node: - init_dag = worker.initialize.bind(input_node) - with InputNode() as input_node: - forward_dag = worker.forward.bind(input_node) - - assert ray.get(init_dag.execute(1)) == 1 - assert ray.get(forward_dag.execute(2)) == 2 - - # Make sure both forward/initialize called only once - assert ray.get(worker.get.remote()) == (1, 1) - - # Double check the actor is resued. 
- assert ray.get(init_dag.execute(1)) == 1 - assert ray.get(worker.get.remote()) == (1, 2) - - -def test_tensor_parallel_dag(shared_ray_instance): - @ray.remote - class Worker: - def __init__(self, rank): - self.rank = rank - self.forwarded = 0 - - def forward(self, input_data: int): - print(input_data) - self.forwarded += 1 - return self.rank + input_data - - def initialize(self): - pass - - def get_forwarded(self): - return self.forwarded - - NUM_WORKERS = 4 - workers = [Worker.remote(i) for i in range(NUM_WORKERS)] - # Init multiple times. - for _ in range(4): - ray.get([worker.initialize.remote() for worker in workers]) - - with InputNode() as input_data: - dag = OutputNode([worker.forward.bind(input_data) for worker in workers]) - - # Run DAG repetitively. - ITER = 4 - assert ITER > 1 - for i in range(ITER): - ref = dag.execute(i) - all_outputs = ray.get(ref) - assert len(all_outputs) == NUM_WORKERS - assert all_outputs == [i + j for j in range(NUM_WORKERS)] - - forwarded = ray.get([worker.get_forwarded.remote() for worker in workers]) - assert forwarded == [ITER for _ in range(NUM_WORKERS)] - - -if __name__ == "__main__": - import sys - - sys.exit(pytest.main(["-v", __file__])) From 4436b1f55fe5d34b41e0c46f4045de195b5add0f Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 10:33:35 -0800 Subject: [PATCH 56/66] revert Signed-off-by: Stephanie Wang --- python/ray/dag/class_node.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index 6c07bc2ace1b3..e4a74f83c7495 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -23,6 +23,7 @@ def __init__( other_args_to_resolve=None, ): self._body = cls + self._last_call: Optional["ClassMethodNode"] = None super().__init__( cls_args, cls_kwargs, @@ -102,6 +103,7 @@ def __init__(self, actor: ClassNode, method_name: str, options: dict): def bind(self, *args, **kwargs): other_args_to_resolve = { PARENT_CLASS_NODE_KEY: self._actor, + PREV_CLASS_METHOD_CALL_KEY: self._actor._last_call, } node = ClassMethodNode( @@ -111,6 +113,7 @@ def bind(self, *args, **kwargs): self._options, other_args_to_resolve=other_args_to_resolve, ) + self._actor._last_call = node return node def __getattr__(self, attr: str): @@ -199,12 +202,3 @@ def __str__(self) -> str: def get_method_name(self) -> str: return self._method_name - - def _get_remote_method(self, method_name): - method_body = getattr(self._parent_class_node, method_name) - return method_body - - def _get_actor_handle(self) -> Optional[ReferenceType["ray.actor.ActorHandle"]]: - if not isinstance(self._parent_class_node, ray.actor.ActorHandle): - return None - return self._parent_class_node From f105ed517d9e70cf75db8c4ea47806655a70ca4a Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 10:34:02 -0800 Subject: [PATCH 57/66] revert Signed-off-by: Stephanie Wang --- python/ray/dag/tests/test_class_dag.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/python/ray/dag/tests/test_class_dag.py b/python/ray/dag/tests/test_class_dag.py index 2c94b501d006e..61e09ef85a6a4 100644 --- a/python/ray/dag/tests/test_class_dag.py +++ b/python/ray/dag/tests/test_class_dag.py @@ -147,6 +147,13 @@ def combine(x, y): .get("name") == "a2_v0" ) + # refer to actor method a2.inc.options() call + assert ( + test_a2.get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] + .get_options() + .get("name") + == "v3" + ) # refer to a1 constructor .options() call assert ( 
test_a1.get_other_args_to_resolve()[PARENT_CLASS_NODE_KEY] @@ -154,6 +161,21 @@ def combine(x, y): .get("name") == "a1_v1" ) + # refer to latest actor method a1.inc.options() call + assert ( + test_a1.get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] + .get_options() + .get("name") + == "v2" + ) + # refer to first bound actor method a1.inc.options() call + assert ( + test_a1.get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] + .get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] + .get_options() + .get("name") + == "v1" + ) def test_pass_actor_handle(shared_ray_instance): From 00f3f1cdb71007230d517dee786d20b3e5f2538a Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 10:43:37 -0800 Subject: [PATCH 58/66] x Signed-off-by: Stephanie Wang --- python/ray/actor.py | 5 ++-- python/ray/dag/__init__.py | 2 ++ python/ray/dag/class_node.py | 9 ++++++ python/ray/dag/compiled_dag_node.py | 31 ++++++++++++-------- python/ray/dag/dag_node.py | 7 +---- python/ray/dag/tests/test_accelerated_dag.py | 2 +- 6 files changed, 33 insertions(+), 23 deletions(-) diff --git a/python/ray/actor.py b/python/ray/actor.py index e901ed414c356..4560d72c03de9 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -29,7 +29,6 @@ PythonFunctionDescriptor, raise_sys_exit_with_custom_error_message, ) -from ray.dag.class_node import PARENT_CLASS_NODE_KEY, ClassMethodNode from ray.exceptions import AsyncioActorExit from ray.util.annotations import DeveloperAPI, PublicAPI from ray.util.placement_group import _configure_placement_group_based_on_context @@ -152,7 +151,7 @@ def __init__( decorator=None, hardref=False, ): - self._actor_ref = weakref.proxy(actor) + self._actor_ref = weakref.ref(actor) self._method_name = method_name self._num_returns = num_returns @@ -319,7 +318,7 @@ def invocation(args, kwargs): def __getstate__(self): return { - "actor": self._actor_ref, + "actor": self._actor_ref(), "method_name": self._method_name, "num_returns": self._num_returns, "max_retries": self._max_retries, diff --git a/python/ray/dag/__init__.py b/python/ray/dag/__init__.py index 726d5930b17d0..e74f57245c097 100644 --- a/python/ray/dag/__init__.py +++ b/python/ray/dag/__init__.py @@ -9,6 +9,7 @@ from ray.dag.output_node import MultiOutputNode from ray.dag.constants import ( PARENT_CLASS_NODE_KEY, + PREV_CLASS_METHOD_CALL_KEY, DAGNODE_TYPE_KEY, ) from ray.dag.vis_utils import plot @@ -22,6 +23,7 @@ "InputAttributeNode", "DAGInputData", "PARENT_CLASS_NODE_KEY", + "PREV_CLASS_METHOD_CALL_KEY", "DAGNODE_TYPE_KEY", "plot", "MultiOutputNode", diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index e4a74f83c7495..84a2f9dac1963 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -202,3 +202,12 @@ def __str__(self) -> str: def get_method_name(self) -> str: return self._method_name + + def _get_remote_method(self, method_name): + method_body = getattr(self._parent_class_node, method_name) + return method_body + + def _get_actor_handle(self) -> Optional[ReferenceType["ray.actor.ActorHandle"]]: + if not isinstance(self._parent_class_node, ray.actor.ActorHandle): + return None + return self._parent_class_node diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 17f6362b6bfd7..dac78303f1ee3 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -92,14 +92,14 @@ def __init__(self, idx: int, dag_node: "ray.dag.DAGNode"): self.output_channel = None @property - def args(self): + def 
args(self) -> Tuple[Any]: return self.dag_node.get_args() @property - def num_readers(self): + def num_readers(self) -> int: return len(self.downstream_node_idxs) - def __str__(self): + def __str__(self) -> str: return f""" Node: {self.dag_node} Arguments: {self.args} @@ -108,6 +108,11 @@ def __str__(self): class CompiledDAG: + """Experimental class for accelerated execution. + + See REP https://github.com/ray-project/enhancements/pull/48 for more + information. + """ def __init__(self): # idx -> CompiledTask. self.idx_to_task = {} @@ -165,13 +170,13 @@ def _preprocess(self) -> None: ): if isinstance(dag_node, InputAttributeNode): # TODO(swang): Support multi args. - raise ValueError( + raise NotImplementedError( "Compiled DAGs currently do not support kwargs or " "multiple args for InputNode" ) elif isinstance(dag_node, FunctionNode): # TODO(swang): Support non-actor tasks. - raise ValueError( + raise NotImplementedError( "Compiled DAGs currently only support actor method nodes" ) else: @@ -195,7 +200,7 @@ def _preprocess(self) -> None: for actor_id, task_count in self.actor_task_count.items(): if task_count > 1: - raise ValueError( + raise NotImplementedError( "Compiled DAGs can contain at most one task per actor handle. " f"Actor with ID {actor_id} appears {task_count}x." ) @@ -207,7 +212,7 @@ def _preprocess(self) -> None: self.input_task_idx = idx # TODO: Support no-input DAGs (use an empty object to signal). if self.input_task_idx is None: - raise ValueError("Compiled DAGs currently require exactly one InputNode") + raise NotImplementedError("Compiled DAGs currently require exactly one InputNode") # Find the (multi-)output node to the DAG. for idx, task in self.idx_to_task.items(): @@ -227,7 +232,7 @@ def _preprocess(self) -> None: # now. self._preprocess() - def _compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: + def _get_or_compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: """Compile an execution path. This allocates channels for adjacent tasks to send/receive values. An infinite task is submitted to each actor in the DAG that repeatedly receives from input channel(s) and @@ -363,16 +368,16 @@ def execute( # These errors should already be caught during compilation, but just in # case. 
if len(args) != 1: - raise ValueError("Compiled DAGs support exactly one InputNode arg") + raise NotImplementedError("Compiled DAGs support exactly one InputNode arg") if len(kwargs) != 0: - raise ValueError("Compiled DAGs do not support kwargs") + raise NotImplementedError("Compiled DAGs do not support kwargs") - input_channel, output_channels = self._compile() + input_channel, output_channels = self._get_or_compile() input_channel.write(args[0]) return output_channels -def build_compiled_dag_from_ray_dag(dag: "ray.dag.DAGNode"): +def build_compiled_dag_from_ray_dag(dag: "ray.dag.DAGNode") -> "CompiledDAG": compiled_dag = CompiledDAG() def _build_compiled_dag(node): @@ -380,5 +385,5 @@ def _build_compiled_dag(node): return node dag.apply_recursive(_build_compiled_dag) - compiled_dag._compile() + compiled_dag._get_or_compile() return compiled_dag diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 6d15889eea8c2..7da478de68f32 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -61,8 +61,6 @@ def __init__( # Cached values from last call to execute() self.cache_from_last_execute = {} - self._compiled_dag = None - def get_args(self) -> Tuple[Any]: """Return the tuple of arguments for this node.""" @@ -111,10 +109,7 @@ def experimental_compile(self) -> "ray.dag.CompiledDAG": """Compile an accelerated execution path for this DAG. The compiled DAG is cached. """ - if self._compiled_dag is None: - self._compiled_dag = build_compiled_dag_from_ray_dag(self) - - return self._compiled_dag + return build_compiled_dag_from_ray_dag(self) def execute( self, *args, _ray_cache_refs: bool = False, **kwargs diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 13a90ceef3b13..0687607d93348 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -15,7 +15,7 @@ logger = logging.getLogger(__name__) -@ray.remote(concurrency_groups={"_ray_system": 1}) +@ray.remote class Actor: def __init__(self, init_value): print("__init__ PID", os.getpid()) From 257457d5ff09ebf8d01231ad6fe0c84ed0b8daa1 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 10:52:55 -0800 Subject: [PATCH 59/66] buffer size bytes Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 37 +++++++++++++++++------------ python/ray/dag/dag_node.py | 14 +++++++---- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index dac78303f1ee3..ebc3d6611f764 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -15,7 +15,7 @@ def do_allocate_channel( - self, buffer_size_bytes: int = MAX_BUFFER_SIZE, num_readers: int = 1 + self, buffer_size_bytes: int, num_readers: int = 1 ) -> ChannelType: """Generic actor method to allocate an output channel. @@ -113,27 +113,33 @@ class CompiledDAG: See REP https://github.com/ray-project/enhancements/pull/48 for more information. """ - def __init__(self): + def __init__(self, buffer_size_bytes: Optional[int]): + self._buffer_size_bytes : Optional[int] = buffer_size_bytes + if self._buffer_size_bytes is None: + self._buffer_size_bytes = MAX_BUFFER_SIZE + if not isinstance(self._buffer_size_bytes, int) or self._buffer_size_bytes <= 0: + raise ValueError(f"`buffer_size_bytes` must be a positive integer, found {self._buffer_size_bytes}") + # idx -> CompiledTask. 
- self.idx_to_task = {} + self.idx_to_task : Dict[int, "CompiledTask"] = {} # DAGNode -> idx. - self.dag_node_to_idx = {} + self.dag_node_to_idx : Dict["ray.dag.DAGNode", int] = {} # idx counter. - self.counter = 0 + self.counter : int = 0 # Attributes that are set during preprocessing. # Preprocessing identifies the input node and output node. - self.input_task_idx = None - self.output_task_idx = None - self.has_single_output = False - self.actor_task_count = defaultdict(int) + self.input_task_idx : Optional[int] = None + self.output_task_idx : Optional[int] = None + self.has_single_output : bool = False + self.actor_task_count : Dict["ray._raylet.ActorID", int] = defaultdict(int) # Cached attributes that are set during compilation. - self.dag_input_channel = None - self.dag_output_channels = None + self.dag_input_channel : Optional[ChannelType] = None + self.dag_output_channels : Optional[ChannelType] = None # ObjectRef for each worker's task. The task is an infinite loop that # repeatedly executes the method specified in the DAG. - self.worker_task_refs = [] + self.worker_task_refs : List["ray.ObjectRef"] = [] def _add_node(self, node: "ray.dag.DAGNode") -> None: idx = self.counter @@ -277,12 +283,13 @@ def _get_or_compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelT task.output_channel = ray.get( fn.remote( do_allocate_channel, + buffer_size_bytes=self._buffer_size_bytes, num_readers=task.num_readers, ) ) elif isinstance(task.dag_node, InputNode): task.output_channel = ray_channel.Channel( - buffer_size_bytes=MAX_BUFFER_SIZE, num_readers=task.num_readers + buffer_size_bytes=self._buffer_size_bytes, num_readers=task.num_readers ) else: assert isinstance(task.dag_node, OutputNode) @@ -377,8 +384,8 @@ def execute( return output_channels -def build_compiled_dag_from_ray_dag(dag: "ray.dag.DAGNode") -> "CompiledDAG": - compiled_dag = CompiledDAG() +def build_compiled_dag_from_ray_dag(dag: "ray.dag.DAGNode", buffer_size_bytes: Optional[int]) -> "CompiledDAG": + compiled_dag = CompiledDAG(buffer_size_bytes) def _build_compiled_dag(node): compiled_dag._add_node(node) diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 7da478de68f32..50136e6d855d5 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -105,11 +105,17 @@ async def get_object_refs_from_last_execute(self) -> Dict[str, Any]: def clear_cache(self): self.cache_from_last_execute = {} - def experimental_compile(self) -> "ray.dag.CompiledDAG": - """Compile an accelerated execution path for this DAG. The compiled DAG - is cached. + def experimental_compile(self, buffer_size_bytes: Optional[int] = None) -> "ray.dag.CompiledDAG": + """Compile an accelerated execution path for this DAG. + + Args: + buffer_size_bytes: The maximum size of messages that can be passed + between tasks in the DAG. + + Returns: + A compiled DAG. 
""" - return build_compiled_dag_from_ray_dag(self) + return build_compiled_dag_from_ray_dag(self, buffer_size_bytes) def execute( self, *args, _ray_cache_refs: bool = False, **kwargs From 71c32ae659e89d20de87a97e122d5e23bbad5d9a Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 19:14:26 -0800 Subject: [PATCH 60/66] optional Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index ebc3d6611f764..3abb813ebe1f3 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -1,5 +1,5 @@ import logging -from typing import Any, List, Tuple, Union +from typing import Any, List, Tuple, Union, Optional from collections import defaultdict From 2ba93f0f5041d853714d3beae92eb23c1b4c213f Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 21:20:49 -0800 Subject: [PATCH 61/66] x Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 61 +++++++++++++++++------------ python/ray/dag/dag_node.py | 4 +- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 3abb813ebe1f3..82d0a3901b546 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -1,5 +1,5 @@ import logging -from typing import Any, List, Tuple, Union, Optional +from typing import Any, Dict, List, Tuple, Union, Optional from collections import defaultdict @@ -113,33 +113,37 @@ class CompiledDAG: See REP https://github.com/ray-project/enhancements/pull/48 for more information. """ + def __init__(self, buffer_size_bytes: Optional[int]): - self._buffer_size_bytes : Optional[int] = buffer_size_bytes + self._buffer_size_bytes: Optional[int] = buffer_size_bytes if self._buffer_size_bytes is None: self._buffer_size_bytes = MAX_BUFFER_SIZE if not isinstance(self._buffer_size_bytes, int) or self._buffer_size_bytes <= 0: - raise ValueError(f"`buffer_size_bytes` must be a positive integer, found {self._buffer_size_bytes}") + raise ValueError( + "`buffer_size_bytes` must be a positive integer, found " + f"{self._buffer_size_bytes}" + ) # idx -> CompiledTask. - self.idx_to_task : Dict[int, "CompiledTask"] = {} + self.idx_to_task: Dict[int, "CompiledTask"] = {} # DAGNode -> idx. - self.dag_node_to_idx : Dict["ray.dag.DAGNode", int] = {} + self.dag_node_to_idx: Dict["ray.dag.DAGNode", int] = {} # idx counter. - self.counter : int = 0 + self.counter: int = 0 # Attributes that are set during preprocessing. # Preprocessing identifies the input node and output node. - self.input_task_idx : Optional[int] = None - self.output_task_idx : Optional[int] = None - self.has_single_output : bool = False - self.actor_task_count : Dict["ray._raylet.ActorID", int] = defaultdict(int) + self.input_task_idx: Optional[int] = None + self.output_task_idx: Optional[int] = None + self.has_single_output: bool = False + self.actor_task_count: Dict["ray._raylet.ActorID", int] = defaultdict(int) # Cached attributes that are set during compilation. - self.dag_input_channel : Optional[ChannelType] = None - self.dag_output_channels : Optional[ChannelType] = None + self.dag_input_channel: Optional[ChannelType] = None + self.dag_output_channels: Optional[ChannelType] = None # ObjectRef for each worker's task. The task is an infinite loop that # repeatedly executes the method specified in the DAG. 
- self.worker_task_refs : List["ray.ObjectRef"] = [] + self.worker_task_refs: List["ray.ObjectRef"] = [] def _add_node(self, node: "ray.dag.DAGNode") -> None: idx = self.counter @@ -160,7 +164,7 @@ def _preprocess(self) -> None: FunctionNode, InputAttributeNode, InputNode, - OutputNode, + MultiOutputNode, ) self.input_task_idx, self.output_task_idx = None, None @@ -171,7 +175,7 @@ def _preprocess(self) -> None: dag_node = task.dag_node if not ( isinstance(dag_node, InputNode) - or isinstance(dag_node, OutputNode) + or isinstance(dag_node, MultiOutputNode) or isinstance(dag_node, ClassMethodNode) ): if isinstance(dag_node, InputAttributeNode): @@ -218,7 +222,9 @@ def _preprocess(self) -> None: self.input_task_idx = idx # TODO: Support no-input DAGs (use an empty object to signal). if self.input_task_idx is None: - raise NotImplementedError("Compiled DAGs currently require exactly one InputNode") + raise NotImplementedError( + "Compiled DAGs currently require exactly one InputNode" + ) # Find the (multi-)output node to the DAG. for idx, task in self.idx_to_task.items(): @@ -228,17 +234,19 @@ def _preprocess(self) -> None: assert self.output_task_idx is not None output_node = self.idx_to_task[self.output_task_idx].dag_node - # Add an OutputNode to the end of the DAG if it's not already there. - if not isinstance(output_node, OutputNode): + # Add an MultiOutputNode to the end of the DAG if it's not already there. + if not isinstance(output_node, MultiOutputNode): self.has_single_output = True - output_node = OutputNode([output_node]) + output_node = MultiOutputNode([output_node]) self._add_node(output_node) self.output_task_idx = self.dag_node_to_idx[output_node] # Preprocess one more time so that we have the right output node # now. self._preprocess() - def _get_or_compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: + def _get_or_compile( + self, + ) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: """Compile an execution path. This allocates channels for adjacent tasks to send/receive values. An infinite task is submitted to each actor in the DAG that repeatedly receives from input channel(s) and @@ -253,7 +261,7 @@ def _get_or_compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelT output channel(s) should be read by the caller to get the DAG output. """ - from ray.dag import DAGNode, InputNode, OutputNode, ClassMethodNode + from ray.dag import DAGNode, InputNode, MultiOutputNode, ClassMethodNode if self.input_task_idx is None: self._preprocess() @@ -289,10 +297,11 @@ def _get_or_compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelT ) elif isinstance(task.dag_node, InputNode): task.output_channel = ray_channel.Channel( - buffer_size_bytes=self._buffer_size_bytes, num_readers=task.num_readers + buffer_size_bytes=self._buffer_size_bytes, + num_readers=task.num_readers, ) else: - assert isinstance(task.dag_node, OutputNode) + assert isinstance(task.dag_node, MultiOutputNode) for idx in task.downstream_node_idxs: queue.append(idx) @@ -348,7 +357,7 @@ def _get_or_compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelT assert [ output_channel is not None for output_channel in self.dag_output_channels ] - # If no OutputNode was specified during the DAG creation, there is only + # If no MultiOutputNode was specified during the DAG creation, there is only # one output. Return a single output channel instead of a list of # channels. 
if self.has_single_output: @@ -384,7 +393,9 @@ def execute( return output_channels -def build_compiled_dag_from_ray_dag(dag: "ray.dag.DAGNode", buffer_size_bytes: Optional[int]) -> "CompiledDAG": +def build_compiled_dag_from_ray_dag( + dag: "ray.dag.DAGNode", buffer_size_bytes: Optional[int] +) -> "CompiledDAG": compiled_dag = CompiledDAG(buffer_size_bytes) def _build_compiled_dag(node): diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 50136e6d855d5..5acfbd02f92cf 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -105,7 +105,9 @@ async def get_object_refs_from_last_execute(self) -> Dict[str, Any]: def clear_cache(self): self.cache_from_last_execute = {} - def experimental_compile(self, buffer_size_bytes: Optional[int] = None) -> "ray.dag.CompiledDAG": + def experimental_compile( + self, buffer_size_bytes: Optional[int] = None + ) -> "ray.dag.CompiledDAG": """Compile an accelerated execution path for this DAG. Args: From ac5fa5563fa604e46e84299fc6cc5dd9f6b31d77 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 21:32:13 -0800 Subject: [PATCH 62/66] x Signed-off-by: Stephanie Wang --- python/ray/dag/tests/test_accelerated_dag.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 0687607d93348..b293ee8940267 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -14,6 +14,9 @@ logger = logging.getLogger(__name__) +if sys.platform != "linux": + pytest.skip("Skipping, requires Linux.", allow_module_level=True) + @ray.remote class Actor: @@ -118,7 +121,8 @@ def test_dag_errors(ray_start_regular): a = Actor.remote(0) dag = a.inc.bind(1) with pytest.raises( - ValueError, match="Compiled DAGs currently require exactly one InputNode" + NotImplementedError, + match="Compiled DAGs currently require exactly one InputNode", ): dag.experimental_compile() @@ -139,7 +143,8 @@ def f(x): with InputNode() as inp: dag = f.bind(inp) with pytest.raises( - ValueError, match="Compiled DAGs currently only support actor method nodes" + NotImplementedError, + match="Compiled DAGs currently only support actor method nodes", ): dag.experimental_compile() @@ -147,14 +152,15 @@ def f(x): dag = a.inc.bind(inp) dag = a.inc.bind(dag) with pytest.raises( - ValueError, match="Compiled DAGs can contain at most one task per actor handle." + NotImplementedError, + match="Compiled DAGs can contain at most one task per actor handle.", ): dag.experimental_compile() with InputNode() as inp: dag = a.inc_two.bind(inp[0], inp[1]) with pytest.raises( - ValueError, + NotImplementedError, match="Compiled DAGs currently do not support kwargs or multiple args " "for InputNode", ): @@ -163,7 +169,7 @@ def f(x): with InputNode() as inp: dag = a.inc_two.bind(inp.x, inp.y) with pytest.raises( - ValueError, + NotImplementedError, match="Compiled DAGs currently do not support kwargs or multiple args " "for InputNode", ): From 35a37fdb3cda3bdcdb48987b768f34cfb5fc6535 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 13 Dec 2023 10:43:57 -0800 Subject: [PATCH 63/66] lint? 
Signed-off-by: Stephanie Wang --- python/ray/dag/class_node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index 84a2f9dac1963..64f4615e1aada 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -207,7 +207,7 @@ def _get_remote_method(self, method_name): method_body = getattr(self._parent_class_node, method_name) return method_body - def _get_actor_handle(self) -> Optional[ReferenceType["ray.actor.ActorHandle"]]: + def _get_actor_handle(self) -> Optional["ray.actor.ActorHandle"]: if not isinstance(self._parent_class_node, ray.actor.ActorHandle): return None return self._parent_class_node From 13263318685aa43a9e1e4e91631b7233410d870f Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 13 Dec 2023 13:24:22 -0800 Subject: [PATCH 64/66] test Signed-off-by: Stephanie Wang --- python/ray/dag/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/dag/BUILD b/python/ray/dag/BUILD index 164791d0fcd2e..4cb5e5634a13b 100644 --- a/python/ray/dag/BUILD +++ b/python/ray/dag/BUILD @@ -71,7 +71,7 @@ py_test( py_test( name = "test_accelerated_dag", - size = "small", + size = "medium", srcs = dag_tests_srcs, tags = ["exclusive", "team:core", "ray_dag_tests"], deps = [":dag_lib"], From fadec0788ebdf4d91e774ee4b96875b8ad0a2a49 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 13 Dec 2023 13:48:19 -0800 Subject: [PATCH 65/66] API Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 82d0a3901b546..bccf77389b9f1 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -14,6 +14,7 @@ logger = logging.getLogger(__name__) +@DeveloperAPI def do_allocate_channel( self, buffer_size_bytes: int, num_readers: int = 1 ) -> ChannelType: @@ -30,6 +31,7 @@ def do_allocate_channel( return self._output_channel +@DeveloperAPI def do_exec_compiled_task( self, inputs: List[Union[Any, ChannelType]], @@ -76,6 +78,7 @@ def do_exec_compiled_task( raise +@DeveloperAPI class CompiledTask: """Wraps the normal Ray DAGNode with some metadata.""" @@ -107,9 +110,13 @@ def __str__(self) -> str: """ +@DeveloperAPI class CompiledDAG: """Experimental class for accelerated execution. + This class should not be called directly. Instead, create + a ray.dag and call experimental_compile(). + See REP https://github.com/ray-project/enhancements/pull/48 for more information. """ @@ -393,6 +400,7 @@ def execute( return output_channels +@DeveloperAPI def build_compiled_dag_from_ray_dag( dag: "ray.dag.DAGNode", buffer_size_bytes: Optional[int] ) -> "CompiledDAG": From ff19557067e7528a88d8a10cb5bbe10ea852cae0 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 13 Dec 2023 15:05:31 -0800 Subject: [PATCH 66/66] x Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index bccf77389b9f1..4e86b5686f531 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -5,6 +5,7 @@ import ray import ray.experimental.channel as ray_channel +from ray.util.annotations import DeveloperAPI MAX_BUFFER_SIZE = int(100 * 1e6) # 100MB
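
The patches above (58-62) assemble the compiled-DAG path: experimental_compile(buffer_size_bytes) builds a CompiledDAG, and CompiledDAG.execute() writes the single argument into the DAG's input channel and hands back the output channel(s) for the caller to read. What follows is a minimal, hypothetical usage sketch of that API, not part of the patch series: the Adder actor is invented for illustration, and the begin_read()/end_read() calls are an assumption about the read side of ray.experimental.channel.Channel (only write() appears in the diffs above).

import ray
from ray.dag import InputNode


@ray.remote
class Adder:
    # Hypothetical actor used only for this sketch.
    def __init__(self, increment):
        self.increment = increment

    def inc(self, x):
        return x + self.increment


ray.init()
adder = Adder.remote(1)

# Bind a single actor method to the DAG input, as in the tests above.
with InputNode() as inp:
    dag = adder.inc.bind(inp)

# buffer_size_bytes caps the size of values passed through the shared-memory
# channels; when left as None it falls back to MAX_BUFFER_SIZE (100 MB).
compiled_dag = dag.experimental_compile(buffer_size_bytes=1000)

# execute() writes the argument into the DAG's input channel and returns the
# output channel (a single channel here, since no MultiOutputNode was used).
output_channel = compiled_dag.execute(41)

# Assumed Channel read API: begin_read() blocks until the value is ready,
# end_read() releases the buffer so the next execution can reuse it.
assert output_channel.begin_read() == 42
output_channel.end_read()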
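
Likewise, a hedged sketch of the multi-output case enabled by the OutputNode -> MultiOutputNode rename in patch 61, mirroring the tensor-parallel test at the top of this section: one InputNode fanned out to several single-task actors, with MultiOutputNode collecting one output channel per actor. The Worker class and the channel read calls are again assumptions for illustration, not code taken from the patches.

import ray
from ray.dag import InputNode, MultiOutputNode


@ray.remote
class Worker:
    # Hypothetical worker actor; compiled DAGs allow at most one task per
    # actor handle, so each worker contributes exactly one node.
    def __init__(self, rank):
        self.rank = rank

    def forward(self, x):
        return self.rank + x


ray.init()
NUM_WORKERS = 4
workers = [Worker.remote(rank) for rank in range(NUM_WORKERS)]

# A single InputNode feeds every worker; MultiOutputNode marks the DAG output.
with InputNode() as inp:
    dag = MultiOutputNode([w.forward.bind(inp) for w in workers])

compiled_dag = dag.experimental_compile()

# With a MultiOutputNode, execute() returns one output channel per worker.
channels = compiled_dag.execute(10)
results = [channel.begin_read() for channel in channels]
assert results == [10 + rank for rank in range(NUM_WORKERS)]
for channel in channels:
    channel.end_read()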