From 304424737ab6d9d92d337c93f6fc41ceb0639934 Mon Sep 17 00:00:00 2001
From: SangBin Cho
Date: Fri, 17 Nov 2023 08:37:15 +0900
Subject: [PATCH 01/66] ip

---
 SANGREADME.md | 66 ++++++++++++
 a.py | 30 ++++++
 python/ray/dag/__init__.py | 2 +
 python/ray/dag/dag_node.py | 2 +-
 python/ray/dag/output_node.py | 49 +++++++++
 python/ray/dag/tests/test_accelerator_dag.py | 102 +++++++++++++++++++
 python/ray/dag/utils.py | 3 +
 python/ray/serve/tests/common/test_dags.py | 1 +
 8 files changed, 254 insertions(+), 1 deletion(-)
 create mode 100644 SANGREADME.md
 create mode 100644 a.py
 create mode 100644 python/ray/dag/output_node.py
 create mode 100644 python/ray/dag/tests/test_accelerator_dag.py

diff --git a/SANGREADME.md b/SANGREADME.md
new file mode 100644
index 0000000000000..f7283cb495891
--- /dev/null
+++ b/SANGREADME.md
@@ -0,0 +1,66 @@
+Actor.bind would kill actors unless I cache the refs. We should fix it.
+When actor calls are bound with actor.method.bind, it doesn't create a new DAG; it appends the bound methods to the existing DAG.
+
+Worker -> method1
+       -> method 2
+
+Instead of 2 DAGs with
+
+method1
+method 2
+
+Only 1 input node is possible with the current DAG API.
+
+Serve: Got around the first issue because all actors are detached.
+Not sure how it got around the second case. Maybe it never needed to handle this case.
+
+Example:
+
+worker = Worker.bind()
+dag = worker.method.bind()
+dag2 = worker.method_2.bind()
+
+This will become
+
+worker -> method -> method2
+
+not
+
+worker -> method
+worker -> method_2
+
+
+VLLM
+
+init_worker
+init torch distributed
+init_model
+profile_num_available_blocks
+init_cache_engine
+
+forward
+
+Q:
+- How much of the existing DAG will be used? Are we going to implement our own DAG APIs? (I believe so?)
+- What's the work needed to make .remote work with actors?
+  - Is actor creation supposed to be a part of the DAG?
+- How will the current shared-memory-based transport feature be exposed in the API?
+- How do we handle different input sizes for different object refs? (Do the remaining bytes just become garbage?)
+- e2e flow
+  - InputNode creates the first buffer (object_ref) that could be reused.
+  - Each bind method reuses the buffer.
+  - If the actor is reused.
+    - Use the first buffer created? We can only have 1 input node anyway now.
+- Iterable DAG -> is it just a repeat of execute?
+
+TODO
+- Currently, any bind from an actor will become a huge single DAG starting from the actor.
+  - Need to find a way to exclude ClassNode from DAG execution.
+- Only one input node is possible for a single actor, but an input node can have multiple inputs.
+  - Maybe we should allow multiple input nodes for a single actor (and use it as a starting point).
+  - Not needed now.
+- No way to keep the actor alive.
+  - There's a private argument, _ray_cache_ref, but it will cache all refs, which is not desirable.
+  - New API as part of bind.
+ +1 DAG can only have 1 input Node diff --git a/a.py b/a.py new file mode 100644 index 0000000000000..fe346dda46c8a --- /dev/null +++ b/a.py @@ -0,0 +1,30 @@ +import ray +from ray.dag.vis_utils import plot +ray.init() + +from ray.dag.input_node import InputNode + +@ray.remote +def a(user_input): + return user_input * 2 + +@ray.remote +def b(user_input): + return user_input + 1 + +@ray.remote +def c(x, y): + return x + y + +with InputNode() as dag_input: + a_ref = a.bind(dag_input) + b_ref = b.bind(dag_input) + dag = c.bind(a_ref, b_ref) + +# a(2) + b(2) = c +# (2 * 2) + (2 * 1) +assert ray.get(dag.execute(2)) == 7 + +# a(3) + b(3) = c +# (3 * 2) + (3 * 1) +assert ray.get(dag.execute(3)) == 10 diff --git a/python/ray/dag/__init__.py b/python/ray/dag/__init__.py index 109b09f125946..985db41be0732 100644 --- a/python/ray/dag/__init__.py +++ b/python/ray/dag/__init__.py @@ -6,6 +6,7 @@ InputAttributeNode, DAGInputData, ) +from ray.dag.output_node import OutputNode from ray.dag.constants import ( PARENT_CLASS_NODE_KEY, PREV_CLASS_METHOD_CALL_KEY, @@ -25,4 +26,5 @@ "PREV_CLASS_METHOD_CALL_KEY", "DAGNODE_TYPE_KEY", "plot", + "OutputNode", ] diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 6408f92de15fd..6c20db8c15124 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -294,7 +294,7 @@ def apply_functional( return replaced_inputs - def _execute_impl(self) -> Union[ray.ObjectRef, ray.actor.ActorHandle]: + def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, ray.actor.ActorHandle]: """Execute this node, assuming args have been transformed already.""" raise NotImplementedError diff --git a/python/ray/dag/output_node.py b/python/ray/dag/output_node.py new file mode 100644 index 0000000000000..c98440a274d62 --- /dev/null +++ b/python/ray/dag/output_node.py @@ -0,0 +1,49 @@ +import ray +from typing import Any, Dict, List, Union, Tuple + +from ray.dag import DAGNode +from ray.dag.format_utils import get_dag_node_str +from ray.experimental.gradio_utils import type_to_string +from ray.util.annotations import Deprecated + +IN_CONTEXT_MANAGER = "__in_context_manager__" + + +class OutputNode(DAGNode): + r"""Ray dag node used in DAG building API to mark the endpoint of DAG + """ + + def __init__( + self, + args: Union[DAGNode, List[DAGNode], Tuple[DAGNode]], + other_args_to_resolve: Dict[str, Any] = None, + ): + if isinstance(args, tuple): + args = list(args) + if not isinstance(args, list): + args = (args,) + super().__init__( + args, + {}, + {}, + other_args_to_resolve=other_args_to_resolve or {}, + ) + + def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, ray.actor.ActorHandle]: + if len(self._bound_args) == 1: + return self._bound_args[0] + else: + return self._bound_args + + def _copy_impl( + self, + new_args: List[Any], + new_kwargs: Dict[str, Any], + new_options: Dict[str, Any], + new_other_args_to_resolve: Dict[str, Any], + ) -> "DAGNode": + """Return a copy of this node with the given new args.""" + return OutputNode(new_args, new_other_args_to_resolve) + + def __str__(self) -> str: + return get_dag_node_str(self, "__OutputNode__") diff --git a/python/ray/dag/tests/test_accelerator_dag.py b/python/ray/dag/tests/test_accelerator_dag.py new file mode 100644 index 0000000000000..cf32dcd00852e --- /dev/null +++ b/python/ray/dag/tests/test_accelerator_dag.py @@ -0,0 +1,102 @@ +import pytest + +import ray +from ray.dag.input_node import InputNode +from ray.dag.output_node import OutputNode +from ray.dag import ( + 
PARENT_CLASS_NODE_KEY, + PREV_CLASS_METHOD_CALL_KEY, +) +from ray.dag.vis_utils import plot + +def test_output_node(shared_ray_instance): + @ray.remote + def f(input): + return input + + with InputNode() as input_data: + dag = OutputNode(f.bind(input_data)) + + assert ray.get(dag.execute(1)) == 1 + assert ray.get(dag.execute(2)) == 2 + + with InputNode() as input_data: + dag = OutputNode([f.bind(input_data["x"]), f.bind(input_data["y"])]) + + refs = dag.execute({"x": 1, "y": 2}) + assert len(refs) == 2 + assert ray.get(refs) == [1, 2] + + with InputNode() as input_data: + dag = OutputNode([ + f.bind(input_data["x"]), + f.bind(input_data["y"]), + f.bind(input_data["x"]) + ]) + + refs = dag.execute({"x": 1, "y": 2}) + assert len(refs) == 3 + assert ray.get(refs) == [1, 2, 1] + + +def test_a(shared_ray_instance): + @ray.remote + class Worker: + def __init__(self): + pass + + def forward(self, input): + print("forward") + + def initialize(self, input): + print("initialize") + + worker = Worker.bind() + with InputNode() as input_node: + dag1 = worker.initialize.bind(input_node) + with InputNode() as input_node: + dag2 = worker.forward.bind(input_node) + + print(ray.get(dag2.execute(1))) + + # plot(dag1, to_file="a.png") + # plot(dag2, to_file="b.png") + + +def test_tensor_parallel_dag(shared_ray_instance): + @ray.remote + class Worker: + def __init__(self, rank): + self.rank = rank + + def forward(self, input_data: int): + print(input_data) + return self.rank + input_data + + def initialize(self): + pass + + with InputNode() as input_data: + workers = [Worker.bind(i) for i in range(4)] + dag = OutputNode( + [worker.forward.bind(input_data) for worker in workers]) + init_dag = OutputNode( + [worker.initialize.bind() for worker in workers]) + + # for _ in range(1): + # refs = dag.execute(2, _ray_cache_refs=True) + # assert len(refs) == 4 + # all_outputs = ray.get(refs) + # assert all_outputs == [2, 3, 4, 5] + + plot(init_dag, to_file="a.png") + plot(dag, to_file="b.png") + # ray.get(init_dag.execute(_ray_cache_refs=True)) + import time + time.sleep(30) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/dag/utils.py b/python/ray/dag/utils.py index fe9ae35168806..3da8570027dbd 100644 --- a/python/ray/dag/utils.py +++ b/python/ray/dag/utils.py @@ -7,6 +7,7 @@ FunctionNode, ClassNode, ClassMethodNode, + OutputNode, ) @@ -22,6 +23,8 @@ def __init__(self): def get_node_name(self, node: DAGNode): # InputNode should be unique. if isinstance(node, InputNode): + return "OUTPUT_NODE" + if isinstance(node, OutputNode): return "INPUT_NODE" # InputAttributeNode suffixes should match the user-defined key. elif isinstance(node, InputAttributeNode): diff --git a/python/ray/serve/tests/common/test_dags.py b/python/ray/serve/tests/common/test_dags.py index ddccf41c5e3a9..8bd7bcbc78912 100644 --- a/python/ray/serve/tests/common/test_dags.py +++ b/python/ray/serve/tests/common/test_dags.py @@ -60,3 +60,4 @@ def get_multi_instantiation_class_nested_deployment_arg_dag(): ray_dag = combine.__call__.bind(dag_input) return ray_dag, dag_input + From 8c5efd8ac3ad421b3eabfe94c1cf69942a9d7502 Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Fri, 17 Nov 2023 15:52:25 +0900 Subject: [PATCH 02/66] basic working. 
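For context, a minimal sketch of the usage this patch enables. It mirrors the
test_dag_with_actor_handle test added below and is illustrative only; binding DAG
nodes on a live actor handle is still experimental at this point in the series.

import ray
from ray.dag.input_node import InputNode

@ray.remote
class Worker:
    def forward(self, x):
        return x

# The actor is created with .remote(), so its lifetime is independent of the DAG.
worker = Worker.remote()

# actor.method.bind() now builds a ClassMethodNode against the live actor handle.
with InputNode() as inp:
    forward_dag = worker.forward.bind(inp)

# The same DAG (and the same actor) can be executed repeatedly.
assert ray.get(forward_dag.execute(1)) == 1
assert ray.get(forward_dag.execute(2)) == 2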
--- SANGREADME.md | 7 +- python/ray/actor.py | 46 +++++++++++- python/ray/dag/__init__.py | 2 - python/ray/dag/class_node.py | 33 ++++----- python/ray/dag/constants.py | 1 - python/ray/dag/dag_node.py | 4 +- python/ray/dag/output_node.py | 2 +- python/ray/dag/tests/test_accelerator_dag.py | 74 +++++++++++++------- python/ray/dag/tests/test_class_dag.py | 27 +------ python/ray/dag/utils.py | 4 +- 10 files changed, 113 insertions(+), 87 deletions(-) diff --git a/SANGREADME.md b/SANGREADME.md index f7283cb495891..4f3a740a0ea72 100644 --- a/SANGREADME.md +++ b/SANGREADME.md @@ -54,12 +54,11 @@ Q: - Iterable DAG -> is it just a repeat of execute? TODO -- Curerntly, any bind from actor will become a huge single DAG starting from actor. +- [done] Curerntly, any bind from actor will become a huge single DAG starting from actor. - Need to find a way to exclude ClassNode from DAG execution. -- Only one input node is possible for a single actor. But input node can have multiple inputs +- [done] Only one input node is possible for a single actor. But input node can have multiple inputs - Maybe we should allow multiple input node for a single actor (and use it as a starting point). - - Not needed now. -- No way to keep the actor alive. +- [done] No way to keep the actor alive. - There's private argument _ray_cache_ref, but it will cache all refs which is not desirable. - New API in the part of bind. diff --git a/python/ray/actor.py b/python/ray/actor.py index a6adfd5e862a7..3019cbe47ca53 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -7,6 +7,7 @@ import ray._private.signature as signature import ray._private.worker import ray._raylet +from ray.dag.class_node import ClassMethodNode, PARENT_CLASS_NODE_KEY from ray import ActorClassID, Language, cross_language from ray._private import ray_option_utils from ray._private.async_compat import is_async_func @@ -136,7 +137,7 @@ def __init__( decorator=None, hardref=False, ): - self._actor_ref = weakref.ref(actor) + self._actor_ref = weakref.proxy(actor) self._method_name = method_name self._num_returns = num_returns self._generator_backpressure_num_objects = generator_backpressure_num_objects @@ -161,6 +162,10 @@ def __call__(self, *args, **kwargs): f"'object.{self._method_name}.remote()'." 
) + @DeveloperAPI + def bind(self, *args, **kwargs): + return self._bind(args, kwargs) + def remote(self, *args, **kwargs): return self._remote(args, kwargs) @@ -181,8 +186,43 @@ def options(self, **options): class FuncWrapper: def remote(self, *args, **kwargs): return func_cls._remote(args=args, kwargs=kwargs, **options) + + @DeveloperAPI + def bind(self, *args, **kwargs): + return func_cls._bind(args=args, kwargs=kwargs, **options) return FuncWrapper() + + @wrap_auto_init + @_tracing_actor_method_invocation + def _bind( + self, + args=None, + kwargs=None, + name="", + num_returns=None, + concurrency_group=None, + _generator_backpressure_num_objects=None, + ): + # TODO(sang): unify option passing + options = { + "name": name, + "num_returns": num_returns, + "concurrency_group": concurrency_group, + "_generator_backpressure_num_objects": _generator_backpressure_num_objects + } + other_args_to_resolve = { + PARENT_CLASS_NODE_KEY: self._actor_ref, + } + + node = ClassMethodNode( + self._method_name, + args, + kwargs, + options, + other_args_to_resolve=other_args_to_resolve, + ) + return node @wrap_auto_init @_tracing_actor_method_invocation @@ -203,7 +243,7 @@ def _remote( ) def invocation(args, kwargs): - actor = self._actor_hard_ref or self._actor_ref() + actor = self._actor_hard_ref or self._actor_ref if actor is None: raise RuntimeError("Lost reference to actor") return actor._actor_method_call( @@ -226,7 +266,7 @@ def invocation(args, kwargs): def __getstate__(self): return { - "actor": self._actor_ref(), + "actor": self._actor_ref, "method_name": self._method_name, "num_returns": self._num_returns, "decorator": self._decorator, diff --git a/python/ray/dag/__init__.py b/python/ray/dag/__init__.py index 985db41be0732..70c4a906393a4 100644 --- a/python/ray/dag/__init__.py +++ b/python/ray/dag/__init__.py @@ -9,7 +9,6 @@ from ray.dag.output_node import OutputNode from ray.dag.constants import ( PARENT_CLASS_NODE_KEY, - PREV_CLASS_METHOD_CALL_KEY, DAGNODE_TYPE_KEY, ) from ray.dag.vis_utils import plot @@ -23,7 +22,6 @@ "InputAttributeNode", "DAGInputData", "PARENT_CLASS_NODE_KEY", - "PREV_CLASS_METHOD_CALL_KEY", "DAGNODE_TYPE_KEY", "plot", "OutputNode", diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index 66eb83084d214..0365682a0eeab 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -1,14 +1,13 @@ +from weakref import ReferenceType + import ray from ray.dag.dag_node import DAGNode from ray.dag.input_node import InputNode from ray.dag.format_utils import get_dag_node_str -from ray.dag.constants import ( - PARENT_CLASS_NODE_KEY, - PREV_CLASS_METHOD_CALL_KEY, -) +from ray.dag.constants import PARENT_CLASS_NODE_KEY from ray.util.annotations import DeveloperAPI -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Union, Tuple @DeveloperAPI @@ -24,7 +23,6 @@ def __init__( other_args_to_resolve=None, ): self._body = cls - self._last_call: Optional["ClassMethodNode"] = None super().__init__( cls_args, cls_kwargs, @@ -85,7 +83,7 @@ def __getattr__(self, method_name: str): raise AttributeError(f".bind() cannot be used again on {type(self)} ") # Raise an error if the method is invalid. 
getattr(self._body, method_name) - call_node = _UnboundClassMethodNode(self, method_name) + call_node = _UnboundClassMethodNode(self, method_name, {}) return call_node def __str__(self) -> str: @@ -93,15 +91,17 @@ def __str__(self) -> str: class _UnboundClassMethodNode(object): - def __init__(self, actor: ClassNode, method_name: str): + def __init__(self, actor: ClassNode, method_name: str, options: dict): + # TODO(sang): Theoretically, We should use weakref cuz it is + # a circular dependency but when I used weakref, it fails + # because we cannot serialize the weakref. self._actor = actor self._method_name = method_name - self._options = {} + self._options = options def bind(self, *args, **kwargs): other_args_to_resolve = { PARENT_CLASS_NODE_KEY: self._actor, - PREV_CLASS_METHOD_CALL_KEY: self._actor._last_call, } node = ClassMethodNode( @@ -111,7 +111,6 @@ def bind(self, *args, **kwargs): self._options, other_args_to_resolve=other_args_to_resolve, ) - self._actor._last_call = node return node def __getattr__(self, attr: str): @@ -146,14 +145,10 @@ def __init__( self._bound_options = method_options or {} self._method_name: str = method_name # Parse other_args_to_resolve and assign to variables - self._parent_class_node: ClassNode = other_args_to_resolve.get( - PARENT_CLASS_NODE_KEY - ) - # Used to track lineage of ClassMethodCall to preserve deterministic - # submission and execution order. - self._prev_class_method_call: Optional[ - ClassMethodNode - ] = other_args_to_resolve.get(PREV_CLASS_METHOD_CALL_KEY, None) + self._parent_class_node: Union[ + ClassNode, + ReferenceType["ray._private.actor.ActorHandle"] + ] = other_args_to_resolve.get(PARENT_CLASS_NODE_KEY) # The actor creation task dependency is encoded as the first argument, # and the ordering dependency as the second, which ensures they are # executed prior to this node. diff --git a/python/ray/dag/constants.py b/python/ray/dag/constants.py index d2d309d56bdaa..77ccb6cc35b78 100644 --- a/python/ray/dag/constants.py +++ b/python/ray/dag/constants.py @@ -1,6 +1,5 @@ # Reserved keys used to handle ClassMethodNode in Ray DAG building. PARENT_CLASS_NODE_KEY = "parent_class_node" -PREV_CLASS_METHOD_CALL_KEY = "prev_class_method_call" # Reserved key to distinguish DAGNode type and avoid collision with user dict. DAGNODE_TYPE_KEY = "__dag_node_type__" diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 6c20db8c15124..cd52f8da07c73 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -105,7 +105,7 @@ def clear_cache(self): def execute( self, *args, _ray_cache_refs: bool = False, **kwargs - ) -> Union[ray.ObjectRef, ray.actor.ActorHandle]: + ) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: """Execute this DAG using the Ray default executor _execute_impl(). 
Args: @@ -294,7 +294,7 @@ def apply_functional( return replaced_inputs - def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, ray.actor.ActorHandle]: + def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: """Execute this node, assuming args have been transformed already.""" raise NotImplementedError diff --git a/python/ray/dag/output_node.py b/python/ray/dag/output_node.py index c98440a274d62..48f02371daef9 100644 --- a/python/ray/dag/output_node.py +++ b/python/ray/dag/output_node.py @@ -29,7 +29,7 @@ def __init__( other_args_to_resolve=other_args_to_resolve or {}, ) - def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, ray.actor.ActorHandle]: + def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: if len(self._bound_args) == 1: return self._bound_args[0] else: diff --git a/python/ray/dag/tests/test_accelerator_dag.py b/python/ray/dag/tests/test_accelerator_dag.py index cf32dcd00852e..91d9c33a6e5b0 100644 --- a/python/ray/dag/tests/test_accelerator_dag.py +++ b/python/ray/dag/tests/test_accelerator_dag.py @@ -3,10 +3,7 @@ import ray from ray.dag.input_node import InputNode from ray.dag.output_node import OutputNode -from ray.dag import ( - PARENT_CLASS_NODE_KEY, - PREV_CLASS_METHOD_CALL_KEY, -) +from ray.dag import PARENT_CLASS_NODE_KEY from ray.dag.vis_utils import plot def test_output_node(shared_ray_instance): @@ -39,28 +36,42 @@ def f(input): assert ray.get(refs) == [1, 2, 1] -def test_a(shared_ray_instance): +def test_dag_with_actor_handle(shared_ray_instance): + """Verify DAG API works with actor created by .remote""" @ray.remote class Worker: def __init__(self): - pass + self.forward_called = 0 + self.init_called = 0 def forward(self, input): print("forward") + self.forward_called += 1 + return input def initialize(self, input): print("initialize") + self.init_called += 1 + return input + + def get(self): + return (self.forward_called, self.init_called) - worker = Worker.bind() + worker = Worker.remote() with InputNode() as input_node: - dag1 = worker.initialize.bind(input_node) + init_dag = worker.initialize.bind(input_node) with InputNode() as input_node: - dag2 = worker.forward.bind(input_node) + forward_dag = worker.forward.bind(input_node) + + assert ray.get(init_dag.execute(1)) == 1 + assert ray.get(forward_dag.execute(2)) == 2 - print(ray.get(dag2.execute(1))) + # Make sure both forward/initialize called only once + assert ray.get(worker.get.remote()) == (1, 1) - # plot(dag1, to_file="a.png") - # plot(dag2, to_file="b.png") + # Double check the actor is resued. + assert ray.get(init_dag.execute(1)) == 1 + assert ray.get(worker.get.remote()) == (1, 2) def test_tensor_parallel_dag(shared_ray_instance): @@ -68,32 +79,41 @@ def test_tensor_parallel_dag(shared_ray_instance): class Worker: def __init__(self, rank): self.rank = rank + self.forwarded = 0 def forward(self, input_data: int): print(input_data) + self.forwarded += 1 return self.rank + input_data def initialize(self): pass + def get_forwarded(self): + return self.forwarded + + NUM_WORKERS = 4 + workers = [Worker.remote(i) for i in range(NUM_WORKERS)] + # Init multiple times. 
+ for _ in range(4): + ray.get([worker.initialize.remote() for worker in workers]) + with InputNode() as input_data: - workers = [Worker.bind(i) for i in range(4)] dag = OutputNode( [worker.forward.bind(input_data) for worker in workers]) - init_dag = OutputNode( - [worker.initialize.bind() for worker in workers]) - - # for _ in range(1): - # refs = dag.execute(2, _ray_cache_refs=True) - # assert len(refs) == 4 - # all_outputs = ray.get(refs) - # assert all_outputs == [2, 3, 4, 5] - - plot(init_dag, to_file="a.png") - plot(dag, to_file="b.png") - # ray.get(init_dag.execute(_ray_cache_refs=True)) - import time - time.sleep(30) + + # Run DAG repetitively. + ITER = 4 + assert ITER > 1 + for i in range(ITER): + ref = dag.execute(i) + all_outputs = ray.get(ref) + assert len(all_outputs) == NUM_WORKERS + assert all_outputs == [i + j for j in range(NUM_WORKERS)] + + forwarded = ray.get( + [worker.get_forwarded.remote() for worker in workers]) + assert forwarded == [ITER for _ in range(NUM_WORKERS)] if __name__ == "__main__": diff --git a/python/ray/dag/tests/test_class_dag.py b/python/ray/dag/tests/test_class_dag.py index 8bef8c792f9f9..f500d7774f02b 100644 --- a/python/ray/dag/tests/test_class_dag.py +++ b/python/ray/dag/tests/test_class_dag.py @@ -1,10 +1,7 @@ import pytest import ray -from ray.dag import ( - PARENT_CLASS_NODE_KEY, - PREV_CLASS_METHOD_CALL_KEY, -) +from ray.dag import PARENT_CLASS_NODE_KEY @ray.remote @@ -150,13 +147,6 @@ def combine(x, y): .get("name") == "a2_v0" ) - # refer to actor method a2.inc.options() call - assert ( - test_a2.get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] - .get_options() - .get("name") - == "v3" - ) # refer to a1 constructor .options() call assert ( test_a1.get_other_args_to_resolve()[PARENT_CLASS_NODE_KEY] @@ -164,21 +154,6 @@ def combine(x, y): .get("name") == "a1_v1" ) - # refer to latest actor method a1.inc.options() call - assert ( - test_a1.get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] - .get_options() - .get("name") - == "v2" - ) - # refer to first bound actor method a1.inc.options() call - assert ( - test_a1.get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] - .get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] - .get_options() - .get("name") - == "v1" - ) def test_pass_actor_handle(shared_ray_instance): diff --git a/python/ray/dag/utils.py b/python/ray/dag/utils.py index 3da8570027dbd..6bb59571b262a 100644 --- a/python/ray/dag/utils.py +++ b/python/ray/dag/utils.py @@ -23,9 +23,9 @@ def __init__(self): def get_node_name(self, node: DAGNode): # InputNode should be unique. if isinstance(node, InputNode): - return "OUTPUT_NODE" - if isinstance(node, OutputNode): return "INPUT_NODE" + if isinstance(node, OutputNode): + return "OUTPUT_NODE" # InputAttributeNode suffixes should match the user-defined key. 
elif isinstance(node, InputAttributeNode): return f"INPUT_ATTRIBUTE_NODE_{node._key}" From 664b07a4aefdb2a6e4b0822295a6592a8114e2bd Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Fri, 17 Nov 2023 23:35:42 +0900 Subject: [PATCH 03/66] enhancement --- a.py | 30 ----------------- python/ray/actor.py | 8 ++--- python/ray/dag/class_node.py | 5 ++- python/ray/dag/dag_node.py | 4 ++- python/ray/dag/input_node.py | 22 ++++++++++++- python/ray/dag/output_node.py | 18 +++++------ python/ray/dag/tests/test_accelerator_dag.py | 34 +++++++++++--------- python/ray/serve/tests/common/test_dags.py | 1 - 8 files changed, 57 insertions(+), 65 deletions(-) delete mode 100644 a.py diff --git a/a.py b/a.py deleted file mode 100644 index fe346dda46c8a..0000000000000 --- a/a.py +++ /dev/null @@ -1,30 +0,0 @@ -import ray -from ray.dag.vis_utils import plot -ray.init() - -from ray.dag.input_node import InputNode - -@ray.remote -def a(user_input): - return user_input * 2 - -@ray.remote -def b(user_input): - return user_input + 1 - -@ray.remote -def c(x, y): - return x + y - -with InputNode() as dag_input: - a_ref = a.bind(dag_input) - b_ref = b.bind(dag_input) - dag = c.bind(a_ref, b_ref) - -# a(2) + b(2) = c -# (2 * 2) + (2 * 1) -assert ray.get(dag.execute(2)) == 7 - -# a(3) + b(3) = c -# (3 * 2) + (3 * 1) -assert ray.get(dag.execute(3)) == 10 diff --git a/python/ray/actor.py b/python/ray/actor.py index 3019cbe47ca53..d2e2877349a0b 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -7,7 +7,6 @@ import ray._private.signature as signature import ray._private.worker import ray._raylet -from ray.dag.class_node import ClassMethodNode, PARENT_CLASS_NODE_KEY from ray import ActorClassID, Language, cross_language from ray._private import ray_option_utils from ray._private.async_compat import is_async_func @@ -30,6 +29,7 @@ StreamingObjectRefGenerator, raise_sys_exit_with_custom_error_message, ) +from ray.dag.class_node import PARENT_CLASS_NODE_KEY, ClassMethodNode from ray.exceptions import AsyncioActorExit from ray.util.annotations import DeveloperAPI, PublicAPI from ray.util.placement_group import _configure_placement_group_based_on_context @@ -186,13 +186,13 @@ def options(self, **options): class FuncWrapper: def remote(self, *args, **kwargs): return func_cls._remote(args=args, kwargs=kwargs, **options) - + @DeveloperAPI def bind(self, *args, **kwargs): return func_cls._bind(args=args, kwargs=kwargs, **options) return FuncWrapper() - + @wrap_auto_init @_tracing_actor_method_invocation def _bind( @@ -209,7 +209,7 @@ def _bind( "name": name, "num_returns": num_returns, "concurrency_group": concurrency_group, - "_generator_backpressure_num_objects": _generator_backpressure_num_objects + "_generator_backpressure_num_objects": _generator_backpressure_num_objects, } other_args_to_resolve = { PARENT_CLASS_NODE_KEY: self._actor_ref, diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index 0365682a0eeab..a474ffa10c553 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -93,7 +93,7 @@ def __str__(self) -> str: class _UnboundClassMethodNode(object): def __init__(self, actor: ClassNode, method_name: str, options: dict): # TODO(sang): Theoretically, We should use weakref cuz it is - # a circular dependency but when I used weakref, it fails + # a circular dependency but when I used weakref, it fails # because we cannot serialize the weakref. 
self._actor = actor self._method_name = method_name @@ -146,8 +146,7 @@ def __init__( self._method_name: str = method_name # Parse other_args_to_resolve and assign to variables self._parent_class_node: Union[ - ClassNode, - ReferenceType["ray._private.actor.ActorHandle"] + ClassNode, ReferenceType["ray._private.actor.ActorHandle"] ] = other_args_to_resolve.get(PARENT_CLASS_NODE_KEY) # The actor creation task dependency is encoded as the first argument, # and the ordering dependency as the second, which ensures they are diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index cd52f8da07c73..6041a12401855 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -294,7 +294,9 @@ def apply_functional( return replaced_inputs - def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: + def _execute_impl( + self, *args, **kwargs + ) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: """Execute this node, assuming args have been transformed already.""" raise NotImplementedError diff --git a/python/ray/dag/input_node.py b/python/ray/dag/input_node.py index 3cffc6891c775..d80786a11c036 100644 --- a/python/ray/dag/input_node.py +++ b/python/ray/dag/input_node.py @@ -92,6 +92,8 @@ def __init__( """ if len(args) != 0 or len(kwargs) != 0: raise ValueError("InputNode should not take any args or kwargs.") + self._args = args + self._kwargs = kwargs self.input_attribute_nodes = {} @@ -103,6 +105,14 @@ def __init__( super().__init__([], {}, {}, other_args_to_resolve=_other_args_to_resolve) + @property + def args(self) -> List[Any]: + return self._args + + @property + def kwargs(self) -> Dict[Any, Any]: + return self._kwargs + def _copy_impl( self, new_args: List[Any], @@ -110,7 +120,9 @@ def _copy_impl( new_options: Dict[str, Any], new_other_args_to_resolve: Dict[str, Any], ): - return InputNode(_other_args_to_resolve=new_other_args_to_resolve) + return InputNode( + *new_args, _other_args_to_resolve=new_other_args_to_resolve, **new_kwargs + ) def _execute_impl(self, *args, **kwargs): """Executor of InputNode.""" @@ -321,6 +333,14 @@ def __init__(self, *args, **kwargs): self._args = list(args) self._kwargs = kwargs + @property + def args(self) -> List[Any]: + return self._args + + @property + def kwargs(self) -> Dict[Any, Any]: + return self._kwargs + def __getitem__(self, key: Union[int, str]) -> Any: if isinstance(key, int): # Access list args by index. 
diff --git a/python/ray/dag/output_node.py b/python/ray/dag/output_node.py index 48f02371daef9..c37e9b0de6954 100644 --- a/python/ray/dag/output_node.py +++ b/python/ray/dag/output_node.py @@ -10,16 +10,17 @@ class OutputNode(DAGNode): - r"""Ray dag node used in DAG building API to mark the endpoint of DAG - """ + r"""Ray dag node used in DAG building API to mark the endpoint of DAG""" def __init__( self, - args: Union[DAGNode, List[DAGNode], Tuple[DAGNode]], + args: Union[List[DAGNode], Tuple[DAGNode]], other_args_to_resolve: Dict[str, Any] = None, ): if isinstance(args, tuple): args = list(args) + if not isinstance(args, list): + raise ValueError(f"Invalid input type for `args`, {type(args)}.") if not isinstance(args, list): args = (args,) super().__init__( @@ -29,11 +30,10 @@ def __init__( other_args_to_resolve=other_args_to_resolve or {}, ) - def _execute_impl(self, *args, **kwargs) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: - if len(self._bound_args) == 1: - return self._bound_args[0] - else: - return self._bound_args + def _execute_impl( + self, *args, **kwargs + ) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: + return self._bound_args def _copy_impl( self, @@ -44,6 +44,6 @@ def _copy_impl( ) -> "DAGNode": """Return a copy of this node with the given new args.""" return OutputNode(new_args, new_other_args_to_resolve) - + def __str__(self) -> str: return get_dag_node_str(self, "__OutputNode__") diff --git a/python/ray/dag/tests/test_accelerator_dag.py b/python/ray/dag/tests/test_accelerator_dag.py index 91d9c33a6e5b0..5bad4e9781380 100644 --- a/python/ray/dag/tests/test_accelerator_dag.py +++ b/python/ray/dag/tests/test_accelerator_dag.py @@ -6,31 +6,34 @@ from ray.dag import PARENT_CLASS_NODE_KEY from ray.dag.vis_utils import plot + def test_output_node(shared_ray_instance): @ray.remote def f(input): return input + with pytest.raises(ValueError): + with InputNode() as input_data: + dag = OutputNode(f.bind(input_data)) + with InputNode() as input_data: - dag = OutputNode(f.bind(input_data)) - - assert ray.get(dag.execute(1)) == 1 - assert ray.get(dag.execute(2)) == 2 + dag = OutputNode([f.bind(input_data)]) + + assert ray.get(dag.execute(1)) == [1] + assert ray.get(dag.execute(2)) == [2] with InputNode() as input_data: dag = OutputNode([f.bind(input_data["x"]), f.bind(input_data["y"])]) - + refs = dag.execute({"x": 1, "y": 2}) assert len(refs) == 2 assert ray.get(refs) == [1, 2] with InputNode() as input_data: - dag = OutputNode([ - f.bind(input_data["x"]), - f.bind(input_data["y"]), - f.bind(input_data["x"]) - ]) - + dag = OutputNode( + [f.bind(input_data["x"]), f.bind(input_data["y"]), f.bind(input_data["x"])] + ) + refs = dag.execute({"x": 1, "y": 2}) assert len(refs) == 3 assert ray.get(refs) == [1, 2, 1] @@ -38,6 +41,7 @@ def f(input): def test_dag_with_actor_handle(shared_ray_instance): """Verify DAG API works with actor created by .remote""" + @ray.remote class Worker: def __init__(self): @@ -53,7 +57,7 @@ def initialize(self, input): print("initialize") self.init_called += 1 return input - + def get(self): return (self.forward_called, self.init_called) @@ -99,8 +103,7 @@ def get_forwarded(self): ray.get([worker.initialize.remote() for worker in workers]) with InputNode() as input_data: - dag = OutputNode( - [worker.forward.bind(input_data) for worker in workers]) + dag = OutputNode([worker.forward.bind(input_data) for worker in workers]) # Run DAG repetitively. 
ITER = 4 @@ -111,8 +114,7 @@ def get_forwarded(self): assert len(all_outputs) == NUM_WORKERS assert all_outputs == [i + j for j in range(NUM_WORKERS)] - forwarded = ray.get( - [worker.get_forwarded.remote() for worker in workers]) + forwarded = ray.get([worker.get_forwarded.remote() for worker in workers]) assert forwarded == [ITER for _ in range(NUM_WORKERS)] diff --git a/python/ray/serve/tests/common/test_dags.py b/python/ray/serve/tests/common/test_dags.py index 8bd7bcbc78912..ddccf41c5e3a9 100644 --- a/python/ray/serve/tests/common/test_dags.py +++ b/python/ray/serve/tests/common/test_dags.py @@ -60,4 +60,3 @@ def get_multi_instantiation_class_nested_deployment_arg_dag(): ray_dag = combine.__call__.bind(dag_input) return ray_dag, dag_input - From 8f6f8d276ae3eb8532c9dfa17f1afd6b9d7838a4 Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Fri, 17 Nov 2023 23:38:00 +0900 Subject: [PATCH 04/66] working now. --- SANGREADME.md | 65 -------------------- python/ray/dag/input_node.py | 22 +------ python/ray/dag/output_node.py | 2 - python/ray/dag/tests/test_accelerator_dag.py | 2 - 4 files changed, 1 insertion(+), 90 deletions(-) delete mode 100644 SANGREADME.md diff --git a/SANGREADME.md b/SANGREADME.md deleted file mode 100644 index 4f3a740a0ea72..0000000000000 --- a/SANGREADME.md +++ /dev/null @@ -1,65 +0,0 @@ -Actor.bind would kill actors unless I cache the refs. We should fix it. -When actor calls are binded with actor.method.bind, it doesn't create a new DAG, but it append binded methods to existing DAG. - -Worker -> method1 - -> method 2 - -Instead of 2 dags with - -method1 -method 2 - -Only 1 input node is possible with current DAG API. - -Serve: Got around the first issue because all actors are detached. -Not sure how it got around the second case. Maybe it never need to handle this case. - -Example: - -worker = Worker.bind() -dag = worker.method.bind() -dag2 = worker.method_2.bind() - -This will become - -worker -> method -> method2 - -not - -worker -> method -worker -> method_2 - - -VLLM - -init_worker -init torch distributed -init_model -profile_num_available_blocks -init_cache_engine - -forward - -Q: -- How much existing DAG will be used? Are we going to implement our own DAG APIs? (I believe so?) -- What's the work needed to make .remote work with actors? - - Is actor creation supposed to be a part of DAG? -- How the current shared memory based transport feature will be exposed to API? -- How do we handle different size input for different object ref? (the remaining bytes are just becoming garbages?) -- e2e flow - - InputNode creates the first buffer (object_ref) that could be reused. - - Each bind method reuses the buffer. - - If actor is reused. - - Use the first buffer created? We can only have 1 input node anyway now. -- Iterable DAG -> is it just a repeat of execute? - -TODO -- [done] Curerntly, any bind from actor will become a huge single DAG starting from actor. - - Need to find a way to exclude ClassNode from DAG execution. -- [done] Only one input node is possible for a single actor. But input node can have multiple inputs - - Maybe we should allow multiple input node for a single actor (and use it as a starting point). -- [done] No way to keep the actor alive. - - There's private argument _ray_cache_ref, but it will cache all refs which is not desirable. - - New API in the part of bind. 
- -1 DAG can only have 1 input Node diff --git a/python/ray/dag/input_node.py b/python/ray/dag/input_node.py index d80786a11c036..3cffc6891c775 100644 --- a/python/ray/dag/input_node.py +++ b/python/ray/dag/input_node.py @@ -92,8 +92,6 @@ def __init__( """ if len(args) != 0 or len(kwargs) != 0: raise ValueError("InputNode should not take any args or kwargs.") - self._args = args - self._kwargs = kwargs self.input_attribute_nodes = {} @@ -105,14 +103,6 @@ def __init__( super().__init__([], {}, {}, other_args_to_resolve=_other_args_to_resolve) - @property - def args(self) -> List[Any]: - return self._args - - @property - def kwargs(self) -> Dict[Any, Any]: - return self._kwargs - def _copy_impl( self, new_args: List[Any], @@ -120,9 +110,7 @@ def _copy_impl( new_options: Dict[str, Any], new_other_args_to_resolve: Dict[str, Any], ): - return InputNode( - *new_args, _other_args_to_resolve=new_other_args_to_resolve, **new_kwargs - ) + return InputNode(_other_args_to_resolve=new_other_args_to_resolve) def _execute_impl(self, *args, **kwargs): """Executor of InputNode.""" @@ -333,14 +321,6 @@ def __init__(self, *args, **kwargs): self._args = list(args) self._kwargs = kwargs - @property - def args(self) -> List[Any]: - return self._args - - @property - def kwargs(self) -> Dict[Any, Any]: - return self._kwargs - def __getitem__(self, key: Union[int, str]) -> Any: if isinstance(key, int): # Access list args by index. diff --git a/python/ray/dag/output_node.py b/python/ray/dag/output_node.py index c37e9b0de6954..d2749cbc1bb0f 100644 --- a/python/ray/dag/output_node.py +++ b/python/ray/dag/output_node.py @@ -3,8 +3,6 @@ from ray.dag import DAGNode from ray.dag.format_utils import get_dag_node_str -from ray.experimental.gradio_utils import type_to_string -from ray.util.annotations import Deprecated IN_CONTEXT_MANAGER = "__in_context_manager__" diff --git a/python/ray/dag/tests/test_accelerator_dag.py b/python/ray/dag/tests/test_accelerator_dag.py index 5bad4e9781380..7114a4f0f0ac7 100644 --- a/python/ray/dag/tests/test_accelerator_dag.py +++ b/python/ray/dag/tests/test_accelerator_dag.py @@ -3,8 +3,6 @@ import ray from ray.dag.input_node import InputNode from ray.dag.output_node import OutputNode -from ray.dag import PARENT_CLASS_NODE_KEY -from ray.dag.vis_utils import plot def test_output_node(shared_ray_instance): From 12b977dd923e4a0b2f16817631f200656e51849b Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 28 Nov 2023 22:08:43 -0800 Subject: [PATCH 05/66] initial commit Signed-off-by: Stephanie Wang --- BUILD.bazel | 4 + python/ray/__init__.py | 1 + python/ray/_private/worker.py | 22 +- python/ray/_raylet.pyx | 20 +- python/ray/includes/libcoreworker.pxd | 7 +- python/ray/tests/test_accelerated_dag.py | 27 +++ src/ray/core_worker/core_worker.cc | 17 +- src/ray/core_worker/core_worker.h | 8 +- .../store_provider/plasma_store_provider.cc | 22 +- .../store_provider/plasma_store_provider.h | 4 +- src/ray/object_manager/common.cc | 155 ++++++++++++++ src/ray/object_manager/common.h | 72 ++++++- src/ray/object_manager/plasma/client.cc | 194 +++++++++++++++--- src/ray/object_manager/plasma/client.h | 8 +- src/ray/object_manager/plasma/common.h | 11 +- src/ray/object_manager/plasma/object_store.cc | 8 + src/ray/object_manager/plasma/plasma.fbs | 2 + src/ray/object_manager/plasma/plasma.h | 3 + .../object_manager/plasma/plasma_allocator.cc | 2 +- src/ray/object_manager/plasma/protocol.cc | 4 + src/ray/object_manager/plasma/store.cc | 56 +++++ src/ray/object_manager/plasma/store.h | 2 + 22 files 
changed, 587 insertions(+), 62 deletions(-) create mode 100644 python/ray/tests/test_accelerated_dag.py create mode 100644 src/ray/object_manager/common.cc diff --git a/BUILD.bazel b/BUILD.bazel index 48eac971a76b0..1f8ff15b53798 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -321,6 +321,7 @@ PLASMA_LINKOPTS = [] + select({ ray_cc_library( name = "plasma_client", srcs = [ + "src/ray/object_manager/common.cc", "src/ray/object_manager/plasma/client.cc", "src/ray/object_manager/plasma/connection.cc", "src/ray/object_manager/plasma/malloc.cc", @@ -401,6 +402,9 @@ ray_cc_library( ":plasma_client", "//src/ray/common:network", ":stats_lib", + "@boost//:asio", + "@boost//:context", + "@boost//:coroutine", ], ) diff --git a/python/ray/__init__.py b/python/ray/__init__.py index e74749ab6e8fa..031f8054cf8e7 100644 --- a/python/ray/__init__.py +++ b/python/ray/__init__.py @@ -118,6 +118,7 @@ def _configure_system(): get, get_actor, get_gpu_ids, + release, init, is_initialized, put, diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index 1d3324fa58fe8..94ab25be014b0 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -688,7 +688,7 @@ def set_mode(self, mode): def set_load_code_from_local(self, load_code_from_local): self._load_code_from_local = load_code_from_local - def put_object(self, value, object_ref=None, owner_address=None): + def put_object(self, value, object_ref=None, owner_address=None, max_readers=-1): """Put value in the local object store with object reference `object_ref`. This assumes that the value for `object_ref` has not yet been placed in @@ -744,7 +744,10 @@ def put_object(self, value, object_ref=None, owner_address=None): # reference counter. return ray.ObjectRef( self.core_worker.put_serialized_object_and_increment_local_ref( - serialized_value, object_ref=object_ref, owner_address=owner_address + serialized_value, + object_ref=object_ref, + owner_address=owner_address, + max_readers=max_readers, ), # The initial local reference is already acquired internally. skip_adding_local_ref=True, @@ -2489,6 +2492,12 @@ def show_in_dashboard(message: str, key: str = "", dtype: str = "text"): blocking_get_inside_async_warned = False +def release(object_ref): + worker = global_worker + worker.check_connected() + worker.core_worker.get_release([object_ref]) + + @overload def get( object_refs: "Sequence[ObjectRef[Any]]", *, timeout: Optional[float] = None @@ -2623,7 +2632,10 @@ def get( @PublicAPI @client_mode_hook def put( - value: Any, *, _owner: Optional["ray.actor.ActorHandle"] = None + value: Any, + *, + _owner: Optional["ray.actor.ActorHandle"] = None, + max_readers=-1, ) -> "ray.ObjectRef": """Store an object in the object store. 
@@ -2669,7 +2681,9 @@ def put( with profiling.profile("ray.put"): try: - object_ref = worker.put_object(value, owner_address=serialize_owner_address) + object_ref = worker.put_object( + value, owner_address=serialize_owner_address, max_readers=max_readers + ) except ObjectStoreFullError: logger.info( "Put failed since the value was either too large or the " diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 8e30b4bd9907e..e55a29dd08226 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -3336,6 +3336,13 @@ cdef class CoreWorker: return RayObjectsToDataMetadataPairs(results) + def get_release(self, object_refs): + cdef: + c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs) + with nogil: + op_status = CCoreWorkerProcess.GetCoreWorker().GetRelease(c_object_ids) + check_status(op_status) + def get_if_local(self, object_refs): """Get objects from local plasma store directly without a fetch request to raylet.""" @@ -3463,13 +3470,15 @@ cdef class CoreWorker: CCoreWorkerProcess.GetCoreWorker().SealExisting( c_object_id, pin_object=False, generator_id=CObjectID.Nil(), - owner_address=c_owner_address)) + owner_address=c_owner_address, + max_readers=-1)) def put_serialized_object_and_increment_local_ref(self, serialized_object, ObjectRef object_ref=None, c_bool pin_object=True, owner_address=None, - c_bool inline_small_object=True): + c_bool inline_small_object=True, + max_readers=-1): cdef: CObjectID c_object_id shared_ptr[CBuffer] data @@ -3477,6 +3486,7 @@ cdef class CoreWorker: unique_ptr[CAddress] c_owner_address c_vector[CObjectID] contained_object_ids c_vector[CObjectReference] contained_object_refs + int64_t c_max_readers = max_readers metadata = string_to_buffer(serialized_object.metadata) total_bytes = serialized_object.total_bytes @@ -3514,7 +3524,8 @@ cdef class CoreWorker: CCoreWorkerProcess.GetCoreWorker().SealOwned( c_object_id, pin_object, - move(c_owner_address))) + move(c_owner_address), + c_max_readers)) else: # Using custom object refs is not supported because we # can't track their lifecycle, so we don't pin the @@ -3523,7 +3534,8 @@ cdef class CoreWorker: CCoreWorkerProcess.GetCoreWorker().SealExisting( c_object_id, pin_object=False, generator_id=CObjectID.Nil(), - owner_address=move(c_owner_address))) + owner_address=move(c_owner_address), + max_readers=c_max_readers)) return c_object_id.Binary() diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 0f9d158cca352..28fa6375212bb 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -240,10 +240,13 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: shared_ptr[CBuffer] *data, c_bool created_by_worker) CRayStatus SealOwned(const CObjectID &object_id, c_bool pin_object, - const unique_ptr[CAddress] &owner_address) + const unique_ptr[CAddress] &owner_address, + int64_t max_readers) CRayStatus SealExisting(const CObjectID &object_id, c_bool pin_object, const CObjectID &generator_id, - const unique_ptr[CAddress] &owner_address) + const unique_ptr[CAddress] &owner_address, + int64_t max_readers) + CRayStatus GetRelease(const c_vector[CObjectID] &object_ids) CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms, c_vector[shared_ptr[CRayObject]] *results) CRayStatus GetIfLocal( diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py new file mode 100644 index 0000000000000..8f6286a3c5351 --- /dev/null +++ 
b/python/ray/tests/test_accelerated_dag.py @@ -0,0 +1,27 @@ +# coding: utf-8 +import logging +import os +import sys + +import pytest + +import ray +import ray.cluster_utils + +logger = logging.getLogger(__name__) + + +def test_put_mutable_object(ray_start_cluster): + # ref = ray.create_mutable_object(size_bytes=1000) + + max_readers = 1 + arr = b"binary" + ref = ray.put(arr, max_readers=max_readers) + ray.release(ref) + + +if __name__ == "__main__": + if os.environ.get("PARALLEL_CI"): + sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) + else: + sys.exit(pytest.main(["-sv", __file__])) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 5042ce3dc164f..586b69714eb35 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1326,9 +1326,10 @@ Status CoreWorker::CreateExisting(const std::shared_ptr &metadata, Status CoreWorker::SealOwned(const ObjectID &object_id, bool pin_object, - const std::unique_ptr &owner_address) { - auto status = - SealExisting(object_id, pin_object, ObjectID::Nil(), std::move(owner_address)); + const std::unique_ptr &owner_address, + int64_t max_readers) { + auto status = SealExisting( + object_id, pin_object, ObjectID::Nil(), std::move(owner_address), max_readers); if (status.ok()) return status; RemoveLocalReference(object_id); if (reference_counter_->HasReference(object_id)) { @@ -1342,8 +1343,9 @@ Status CoreWorker::SealOwned(const ObjectID &object_id, Status CoreWorker::SealExisting(const ObjectID &object_id, bool pin_object, const ObjectID &generator_id, - const std::unique_ptr &owner_address) { - RAY_RETURN_NOT_OK(plasma_store_provider_->Seal(object_id)); + const std::unique_ptr &owner_address, + int64_t max_readers) { + RAY_RETURN_NOT_OK(plasma_store_provider_->Seal(object_id, max_readers)); if (pin_object) { // Tell the raylet to pin the object **after** it is created. RAY_LOG(DEBUG) << "Pinning sealed object " << object_id; @@ -1367,6 +1369,11 @@ Status CoreWorker::SealExisting(const ObjectID &object_id, return Status::OK(); } +Status CoreWorker::GetRelease(const std::vector &object_ids) { + RAY_CHECK(object_ids.size() == 1); + return plasma_store_provider_->GetRelease(object_ids[0]); +} + Status CoreWorker::Get(const std::vector &ids, const int64_t timeout_ms, std::vector> *results) { diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index e1668b61b6f51..d4f621acd6380 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -656,7 +656,8 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// \return Status. Status SealOwned(const ObjectID &object_id, bool pin_object, - const std::unique_ptr &owner_address = nullptr); + const std::unique_ptr &owner_address = nullptr, + int64_t max_readers = -1); /// Finalize placing an object into the object store. This should be called after /// a corresponding `CreateExisting()` call and then writing into the returned buffer. @@ -673,7 +674,10 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { Status SealExisting(const ObjectID &object_id, bool pin_object, const ObjectID &generator_id = ObjectID::Nil(), - const std::unique_ptr &owner_address = nullptr); + const std::unique_ptr &owner_address = nullptr, + int64_t max_readers = -1); + + Status GetRelease(const std::vector &object_ids); /// Get a list of objects from the object store. Objects that failed to be retrieved /// will be returned as nullptrs. 
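Taken together with the Python-side changes above (ray.put gains a max_readers
argument and a new ray.release() is exposed), the intended end-to-end flow looks
roughly like the sketch below. It is based on the test_put_mutable_object test added
in this patch; the read side of the protocol is still being built out here, so treat
the details as provisional.

import ray

ray.init()

# Writer side: sealing with max_readers routes through CoreWorker::SealOwned(...,
# max_readers) and ultimately plasma Seal(object_id, max_readers).
ref = ray.put(b"binary", max_readers=1)

# ... readers would map the same buffer and consume the value here ...

# ray.release() calls CoreWorker::GetRelease(), handing the buffer back so the
# writer can publish the next version.
ray.release(ref)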
diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.cc b/src/ray/core_worker/store_provider/plasma_store_provider.cc index 827678edc0f85..eb440a668a9dd 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.cc +++ b/src/ray/core_worker/store_provider/plasma_store_provider.cc @@ -153,8 +153,9 @@ Status CoreWorkerPlasmaStoreProvider::Create(const std::shared_ptr &meta return status; } -Status CoreWorkerPlasmaStoreProvider::Seal(const ObjectID &object_id) { - return store_client_.Seal(object_id); +Status CoreWorkerPlasmaStoreProvider::Seal(const ObjectID &object_id, + int64_t max_readers) { + return store_client_.Seal(object_id, max_readers); } Status CoreWorkerPlasmaStoreProvider::Release(const ObjectID &object_id) { @@ -171,12 +172,13 @@ Status CoreWorkerPlasmaStoreProvider::FetchAndGetFromPlasmaStore( absl::flat_hash_map> *results, bool *got_exception) { const auto owner_addresses = reference_counter_->GetOwnerAddresses(batch_ids); - RAY_RETURN_NOT_OK( - raylet_client_->FetchOrReconstruct(batch_ids, - owner_addresses, - fetch_only, - /*mark_worker_blocked*/ !in_direct_call, - task_id)); + // TODO this IPC needs to be skipped in shared mode + // RAY_RETURN_NOT_OK( + // raylet_client_->FetchOrReconstruct(batch_ids, + // owner_addresses, + // fetch_only, + // /*mark_worker_blocked*/ !in_direct_call, + // task_id)); std::vector plasma_results; RAY_RETURN_NOT_OK(store_client_.Get(batch_ids, @@ -215,6 +217,10 @@ Status CoreWorkerPlasmaStoreProvider::FetchAndGetFromPlasmaStore( return Status::OK(); } +Status CoreWorkerPlasmaStoreProvider::GetRelease(const ObjectID &object_id) { + return store_client_.GetRelease(object_id); +} + Status CoreWorkerPlasmaStoreProvider::GetIfLocal( const std::vector &object_ids, absl::flat_hash_map> *results) { diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.h b/src/ray/core_worker/store_provider/plasma_store_provider.h index 2e08309c6cc88..523aa86a3e5f0 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.h +++ b/src/ray/core_worker/store_provider/plasma_store_provider.h @@ -135,7 +135,7 @@ class CoreWorkerPlasmaStoreProvider { /// /// \param[in] object_id The ID of the object. This can be used as an /// argument to Get to retrieve the object data. - Status Seal(const ObjectID &object_id); + Status Seal(const ObjectID &object_id, int64_t max_readers = -1); /// Release the first reference to the object created by Put() or Create(). This should /// be called exactly once per object and until it is called, the object is pinned and @@ -151,6 +151,8 @@ class CoreWorkerPlasmaStoreProvider { absl::flat_hash_map> *results, bool *got_exception); + Status GetRelease(const ObjectID &object_id); + /// Get objects directly from the local plasma store, without waiting for the /// objects to be fetched from another node. This should only be used /// internally, never by user code. 
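The PlasmaObjectHeader introduced in the next two files is the heart of the
mutable-object support: a small header stored alongside the object data in the plasma
buffer, carrying a version counter plus reader/writer bookkeeping guarded by a
process-shared mutex, condition variable, and semaphore. As a reading aid only, here
is a rough single-process Python analogue of the acquire/release protocol; it ignores
the immutable num_readers == -1 fast path and the shared-memory layout, and the names
simply follow the C++ fields.

import threading

class MutableObjectHeader:
    # Rough analogue of PlasmaObjectHeader (see common.h/common.cc below); the real
    # implementation lives in shared memory and uses pthread process-shared primitives.
    def __init__(self):
        self._cond = threading.Condition()         # stands in for wr_mut + cond
        self._writer_sem = threading.Semaphore(1)  # stands in for rw_semaphore
        self.version = 0
        self.num_readers = 0
        self.num_read_acquires_remaining = 0
        self.num_read_releases_remaining = 0
        self.data_size = 0

    def write_acquire(self, write_version, new_size):
        # The writer blocks until every reader of the previous version released it.
        self._writer_sem.acquire()
        with self._cond:
            assert write_version == self.version + 1
            self.version = write_version
            self.data_size = new_size

    def write_release(self, write_version, num_readers):
        # Publish the new version and allow exactly `num_readers` reads of it.
        with self._cond:
            assert self.version == write_version
            self.num_readers = num_readers
            self.num_read_acquires_remaining = num_readers
            self.num_read_releases_remaining = num_readers
            self._cond.notify_all()

    def read_acquire(self, read_version):
        # Block until the requested (or a newer) version is published and a read
        # slot is still available; return the version actually read.
        with self._cond:
            while self.version < read_version or self.num_read_acquires_remaining == 0:
                self._cond.wait()
            self.num_read_acquires_remaining -= 1
            return self.version

    def read_release(self, read_version):
        # When the last reader finishes, hand the buffer back to the writer.
        with self._cond:
            assert self.version == read_version
            self.num_read_releases_remaining -= 1
            done = self.num_read_releases_remaining == 0
        if done:
            self._writer_sem.release()

# One writer publishing version 1 for a single reader.
header = MutableObjectHeader()

def reader():
    v = header.read_acquire(1)
    header.read_release(v)

t = threading.Thread(target=reader)
t.start()
header.write_acquire(1, new_size=8)
header.write_release(1, num_readers=1)
t.join()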
diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc new file mode 100644 index 0000000000000..4eff0d3e583b4 --- /dev/null +++ b/src/ray/object_manager/common.cc @@ -0,0 +1,155 @@ +#include "ray/object_manager/common.h" + +namespace ray { + +void PrintPlasmaObjectHeader(const PlasmaObjectHeader *header) { + RAY_LOG(DEBUG) << "PlasmaObjectHeader: \n" + << "version: " << header->version << "\n" + << "num_readers: " << header->num_readers << "\n" + << "num_read_acquires_remaining: " << header->num_read_acquires_remaining + << "\n" + << "num_read_releases_remaining: " << header->num_read_releases_remaining + << "\n" + << "data_size: " << header->data_size << "\n"; +} + +void PlasmaObjectHeader::Init() { + // wr_mut is shared between writer and readers. + pthread_mutexattr_t mutex_attr; + pthread_mutexattr_init(&mutex_attr); + pthread_mutexattr_setpshared(&mutex_attr, PTHREAD_PROCESS_SHARED); + pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_ERRORCHECK); + pthread_mutex_init(&wr_mut, &mutex_attr); + + sem_init(&rw_semaphore, PTHREAD_PROCESS_SHARED, 1); + + // Condition is shared between writer and readers. + pthread_condattr_t cond_attr; + pthread_condattr_init(&cond_attr); + pthread_condattr_setpshared(&cond_attr, PTHREAD_PROCESS_SHARED); + pthread_cond_init(&cond, &cond_attr); +} + +void PlasmaObjectHeader::Destroy() { + RAY_CHECK(pthread_mutex_destroy(&wr_mut) == 0); + RAY_CHECK(pthread_cond_destroy(&cond) == 0); + RAY_CHECK(sem_destroy(&rw_semaphore) == 0); +} + +// Get the data size of the plasma object. +// This has to be called only when reader lock is acquired +// via ReadAcquire. +uint64_t PlasmaObjectHeader::GetDataSize() const { + RAY_CHECK_GE(num_read_releases_remaining, 0) + << "ReadAcquire has to be called before calling this method."; + return data_size; +} + +void PlasmaObjectHeader::WriteAcquire(int64_t write_version, uint64_t new_size) { + RAY_LOG(DEBUG) << "WriteAcquire. version: " << write_version; + sem_wait(&rw_semaphore); + RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); + PrintPlasmaObjectHeader(this); + + RAY_CHECK(num_read_acquires_remaining == 0); + RAY_CHECK(num_read_releases_remaining == 0); + RAY_CHECK(write_version == version + 1) + << "Write version " << write_version + << " is more than 1 greater than current version " << version + << ". Are you sure this is the only writer?"; + + num_readers = 0; + version = write_version; + data_size = new_size; + + RAY_LOG(DEBUG) << "WriteAcquire done"; + PrintPlasmaObjectHeader(this); + RAY_CHECK(pthread_mutex_unlock(&wr_mut) == 0); +} + +void PlasmaObjectHeader::WriteRelease(int64_t write_version, int64_t write_num_readers) { + RAY_LOG(DEBUG) << "WriteRelease Waiting. version: " << write_version + << " max readers: " << write_num_readers; + RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); + RAY_LOG(DEBUG) << "WriteRelease " << write_version + << " max readers: " << write_num_readers; + PrintPlasmaObjectHeader(this); + + RAY_CHECK(version == write_version) + << "Write version " << write_version << " no longer matches current version " + << version << ". Are you sure this is the only writer?"; + + version = write_version; + num_readers = write_num_readers; + num_read_acquires_remaining = num_readers; + num_read_releases_remaining = num_readers; + + RAY_LOG(DEBUG) << "WriteRelease done"; + PrintPlasmaObjectHeader(this); + RAY_CHECK(pthread_mutex_unlock(&wr_mut) == 0); + // Signal to all readers. 
+ RAY_CHECK(pthread_cond_broadcast(&cond) == 0); +} + +int64_t PlasmaObjectHeader::ReadAcquire(int64_t read_version) { + RAY_LOG(DEBUG) << "ReadAcquire Waiting" << read_version; + RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); + RAY_LOG(DEBUG) << "ReadAcquire " << read_version; + PrintPlasmaObjectHeader(this); + + while (version < read_version || num_read_acquires_remaining == 0) { + RAY_CHECK(pthread_cond_wait(&cond, &wr_mut) == 0); + } + + if (version > read_version) { + RAY_LOG(WARNING) << "Version " << version << " already exceeds version to read " + << read_version << ". May have missed earlier reads."; + } + + if (num_readers != -1) { + num_read_acquires_remaining--; + RAY_CHECK(num_read_acquires_remaining >= 0) + << "readers acquired exceeds max readers " << num_readers; + // This object can only be read a constant number of times. Tell the caller + // which version was read. + read_version = version; + } else { + read_version = 0; + } + + RAY_LOG(DEBUG) << "ReadAcquire done"; + PrintPlasmaObjectHeader(this); + + RAY_CHECK(pthread_mutex_unlock(&wr_mut) == 0); + // Signal to other readers that they may read. + RAY_CHECK(pthread_cond_signal(&cond) == 0); + return read_version; +} + +void PlasmaObjectHeader::ReadRelease(int64_t read_version) { + bool all_readers_done = false; + RAY_LOG(DEBUG) << "ReadRelease Waiting" << read_version; + RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); + PrintPlasmaObjectHeader(this); + + RAY_LOG(DEBUG) << "ReadRelease " << read_version << " version is currently " << version; + RAY_CHECK(version == read_version) << "Version " << version << " modified from version " + << read_version << " at read start"; + + if (num_readers != -1) { + num_read_releases_remaining--; + RAY_CHECK(num_read_releases_remaining >= 0); + if (num_read_releases_remaining == 0) { + all_readers_done = true; + } + } + + PrintPlasmaObjectHeader(this); + RAY_LOG(DEBUG) << "ReadRelease done"; + RAY_CHECK(pthread_mutex_unlock(&wr_mut) == 0); + if (all_readers_done) { + sem_post(&rw_semaphore); + } +} + +} // namespace ray diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 66829d2511ebc..23634cbae7d35 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -14,6 +14,9 @@ #pragma once +#include + +#include #include #include @@ -36,9 +39,74 @@ using RestoreSpilledObjectCallback = const std::string &, std::function)>; +struct PlasmaObjectHeader { + // Used to signal to the writer when all readers are done. + sem_t rw_semaphore; + + // Protects all following state, used to signal from writer to readers. + pthread_mutex_t wr_mut; + // Used to signal to readers when the writer is done writing a new version. + pthread_cond_t cond; + // The object version. For immutable objects, this gets incremented to 1 on + // the first write and then should never be modified. For mutable objects, + // each new write must increment the version before releasing to readers. + int64_t version = 0; + // The total number of reads allowed before the writer can write again. This + // value should be set by the writer before releasing to readers. + // For immutable objects, this is set to -1 and infinite reads are allowed. + // Otherwise, readers must acquire/release before/after reading. + int64_t num_readers = 0; + // The number of readers who can acquire the current version. For mutable + // objects, readers must ensure this is > 0 and decrement before they read. 
+ // Once this value reaches 0, no more readers are allowed until the writer + // writes a new version. + int64_t num_read_acquires_remaining = 0; + // The number of readers who must release the current version before a new + // version can be written. For mutable objects, readers must decrement this + // when they are done reading the current version. Once this value reaches 0, + // the reader should signal to the writer that they can write again. + int64_t num_read_releases_remaining = 0; + // The valid data and metadata size of the Ray object. + // Not used for immutable objects. + // For mutable objects, this should be modified when the new object has a + // different data/metadata size. + uint64_t data_size = 0; + uint64_t metadata_size = 0; + + void Init(); + + void Destroy(); + + // Blocks until there are no more readers. + // NOTE: Caller should ensure there is one writer at a time. + /// \param write_version The new version for write. + /// \param new_size The new data size of the object. + void WriteAcquire(int64_t write_version, uint64_t new_data_size); + + // Call after completing a write to signal to num_readers many readers. + void WriteRelease(int64_t write_version, int64_t num_readers); + + // Blocks until the given version or a more recent version is ready to read. + // + // \param read_version The minimum version to wait for. + // \return The version that was read. This should be passed to ReadRelease + // when the reader is done. + int64_t ReadAcquire(int64_t read_version); + + // Finishes the read. If all reads are done, signals to the + // writer. This is not necessary to call for objects that have + // num_readers=-1. + void ReadRelease(int64_t read_version); + + // Get the data size of the plasma object. + // The reader must first ReadAcquire. + uint64_t GetDataSize() const; +}; + /// A struct that includes info about the object. struct ObjectInfo { ObjectID object_id; + bool is_mutable; int64_t data_size = 0; int64_t metadata_size = 0; /// Owner's raylet ID. @@ -50,7 +118,9 @@ struct ObjectInfo { /// Owner's worker ID. WorkerID owner_worker_id; - int64_t GetObjectSize() const { return data_size + metadata_size; } + int64_t GetObjectSize() const { + return sizeof(PlasmaObjectHeader) + data_size + metadata_size; + } bool operator==(const ObjectInfo &other) const { return ((object_id == other.object_id) && (data_size == other.data_size) && diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index e3274a058df1c..31a507cb3dc92 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -32,6 +32,7 @@ #include "absl/container/flat_hash_map.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/ray_config.h" +#include "ray/object_manager/common.h" #include "ray/object_manager/plasma/connection.h" #include "ray/object_manager/plasma/plasma.h" #include "ray/object_manager/plasma/protocol.h" @@ -94,6 +95,12 @@ struct ObjectInUseEntry { PlasmaObject object; /// A flag representing whether the object has been sealed. bool is_sealed; + bool is_shared = false; + /// For shared objects only. + /// The last version that we read or wrote. To read or write again, we must + /// pass a newer version than this. 
+ int64_t next_version_to_read = 1; + int64_t next_version_to_write = 1; }; class PlasmaClient::Impl : public std::enable_shared_from_this { @@ -145,13 +152,15 @@ class PlasmaClient::Impl : public std::enable_shared_from_this &object_ids); @@ -195,10 +204,12 @@ class PlasmaClient::Impl : public std::enable_shared_from_thissecond->pointer(); } +ray::PlasmaObjectHeader *PlasmaClient::Impl::GetPlasmaObjectHeader( + const PlasmaObject &object) const { + auto base_ptr = LookupMmappedFile(object.store_fd); + auto header_ptr = base_ptr + object.header_offset; + return reinterpret_cast(header_ptr); +} + bool PlasmaClient::Impl::IsInUse(const ObjectID &object_id) { std::lock_guard guard(client_mutex_); @@ -271,13 +289,14 @@ bool PlasmaClient::Impl::IsInUse(const ObjectID &object_id) { } void PlasmaClient::Impl::IncrementObjectCount(const ObjectID &object_id, - PlasmaObject *object, + const PlasmaObject *object, bool is_sealed) { // Increment the count of the object to track the fact that it is being used. // The corresponding decrement should happen in PlasmaClient::Release. auto elem = objects_in_use_.find(object_id); ObjectInUseEntry *object_entry; if (elem == objects_in_use_.end()) { + RAY_CHECK(object != nullptr); // Add this object ID to the hash table of object IDs in use. The // corresponding call to free happens in PlasmaClient::Release. objects_in_use_[object_id] = std::make_unique(); @@ -287,7 +306,8 @@ void PlasmaClient::Impl::IncrementObjectCount(const ObjectID &object_id, object_entry = objects_in_use_[object_id].get(); } else { object_entry = elem->second.get(); - RAY_CHECK(object_entry->count > 0); + // TODO(swang): Nicer way to pin shared objects. + // RAY_CHECK(object_entry->count > 0); } // Increment the count of the number of instances of this object that are // being used by this client. The corresponding decrement should happen in @@ -368,6 +388,44 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, fb::ObjectSource source, int device_num) { std::unique_lock guard(client_mutex_); + auto object_entry = objects_in_use_.find(object_id); + if (object_entry != objects_in_use_.end()) { + auto &entry = object_entry->second; + if (entry->is_sealed && entry->is_shared) { + RAY_LOG(DEBUG) << "Create shared object " << object_id << " exists"; + // Wait for no readers. + auto plasma_header = GetPlasmaObjectHeader(entry->object); + // TODO(sang) + // NOTE: entry->object.data_size is the size of the data buffer. + // When the object is shared, we can have object size smaller than the data buffer. + RAY_LOG(DEBUG) << "SANG-TODO Update the data size of " << object_id + << ". Size: " << data_size; + auto next_version_to_write = plasma_header->version + 1; + plasma_header->WriteAcquire(next_version_to_write, data_size); + + // Prepare the data buffer and return to the client instead of sending + // the IPC to object store. + *data = std::make_shared( + shared_from_this(), + GetStoreFdAndMmap(entry->object.store_fd, entry->object.mmap_size) + + entry->object.data_offset, + entry->object.data_size); + // If plasma_create is being called from a transfer, then we will not copy the + // metadata here. The metadata will be written along with the data streamed + // from the transfer. + if (metadata != NULL) { + // Copy the metadata to the buffer. 
+ memcpy((*data)->Data() + entry->object.data_size, + metadata, + entry->object.metadata_size); + } + + entry->is_sealed = false; + IncrementObjectCount(object_id, &entry->object, false); + } + return Status::OK(); + } + uint64_t retry_with_request_id = 0; RAY_LOG(DEBUG) << "called plasma_create on conn " << store_conn_ << " with size " @@ -394,6 +452,20 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, object_id, retry_with_request_id, metadata, &retry_with_request_id, data); } + if (status.ok()) { + // Create IPC was successful. + object_entry = objects_in_use_.find(object_id); + RAY_CHECK(object_entry != objects_in_use_.end()); + auto &entry = object_entry->second; + RAY_CHECK(!entry->is_sealed); + auto plasma_header = GetPlasmaObjectHeader(entry->object); + // The corresponding WriteRelease takes place in Seal. + // When an object is first created, the data size is equivalent to + // buffer size. + // The first creation's version is always 1. + plasma_header->WriteAcquire(/*next_version_to_write*/ 1, entry->object.data_size); + } + return status; } @@ -457,8 +529,19 @@ Status PlasmaClient::Impl::GetBuffers( all_present = false; } else { PlasmaObject *object = &object_entry->second->object; - std::shared_ptr physical_buf; + // Wait for the object to become ready to read. + auto plasma_header = GetPlasmaObjectHeader(*object); + int64_t version_read = + plasma_header->ReadAcquire(object_entry->second->next_version_to_read); + auto data_size = plasma_header->GetDataSize(); + RAY_LOG(DEBUG) << "SANG-TODO data size is " << data_size; + if (version_read > 0) { + object_entry->second->is_shared = true; + object_entry->second->next_version_to_read = version_read; + } + + std::shared_ptr physical_buf; if (object->device_num == 0) { uint8_t *data = LookupMmappedFile(object->store_fd); physical_buf = std::make_shared( @@ -467,8 +550,7 @@ Status PlasmaClient::Impl::GetBuffers( RAY_LOG(FATAL) << "GPU library is not enabled."; } physical_buf = wrap_buffer(object_ids[i], physical_buf); - object_buffers[i].data = - SharedMemoryBuffer::Slice(physical_buf, 0, object->data_size); + object_buffers[i].data = SharedMemoryBuffer::Slice(physical_buf, 0, data_size); object_buffers[i].metadata = SharedMemoryBuffer::Slice( physical_buf, object->data_size, object->metadata_size); object_buffers[i].device_num = object->device_num; @@ -525,6 +607,19 @@ Status PlasmaClient::Impl::GetBuffers( // If we are here, the object was not currently in use, so we need to // process the reply from the object store. if (object->data_size != -1) { + // Increment the count of the number of instances of this object that this + // client is using. Cache the reference to the object. + IncrementObjectCount(received_object_ids[i], object, true); + auto &object_entry = objects_in_use_[received_object_ids[i]]; + // Wait for the object to become ready to read. + auto plasma_header = GetPlasmaObjectHeader(*object); + int64_t version_read = plasma_header->ReadAcquire(/*version=*/1); + auto data_size = plasma_header->GetDataSize(); + if (version_read > 0) { + object_entry->is_shared = true; + object_entry->next_version_to_read = version_read; + } + std::shared_ptr physical_buf; if (object->device_num == 0) { uint8_t *data = LookupMmappedFile(object->store_fd); @@ -535,14 +630,10 @@ Status PlasmaClient::Impl::GetBuffers( } // Finish filling out the return values. 
physical_buf = wrap_buffer(object_ids[i], physical_buf); - object_buffers[i].data = - SharedMemoryBuffer::Slice(physical_buf, 0, object->data_size); + object_buffers[i].data = SharedMemoryBuffer::Slice(physical_buf, 0, data_size); object_buffers[i].metadata = SharedMemoryBuffer::Slice( physical_buf, object->data_size, object->metadata_size); object_buffers[i].device_num = object->device_num; - // Increment the count of the number of instances of this object that this - // client is using. Cache the reference to the object. - IncrementObjectCount(received_object_ids[i], object, true); } else { // The object was not retrieved. The caller can detect this condition // by checking the boolean value of the metadata/data buffers. @@ -569,6 +660,29 @@ Status PlasmaClient::Impl::Get(const std::vector &object_ids, &object_ids[0], num_objects, timeout_ms, wrap_buffer, &(*out)[0], is_from_worker); } +Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { + RAY_LOG(DEBUG) << "Try to release Get for object " << object_id; + std::unique_lock guard(client_mutex_); + auto object_entry = objects_in_use_.find(object_id); + if (object_entry == objects_in_use_.end()) { + return Status::ObjectNotFound( + "ray.release() called on an object that is not in scope"); + } + + auto &entry = object_entry->second; + // RAY_CHECK(entry->is_sealed && entry->is_shared) << "ray.release must be called on " + // "objects that are sealed and shared. sealed? " << entry->is_sealed + // << " shared " << entry->is_shared; + + RAY_LOG(DEBUG) << "Release shared object " << object_id; + auto plasma_header = GetPlasmaObjectHeader(entry->object); + plasma_header->ReadRelease(entry->next_version_to_read); + // The next read needs to read at least this version. + entry->next_version_to_read++; + + return Status::OK(); +} + Status PlasmaClient::Impl::MarkObjectUnused(const ObjectID &object_id) { auto object_entry = objects_in_use_.find(object_id); RAY_CHECK(object_entry != objects_in_use_.end()); @@ -592,7 +706,8 @@ Status PlasmaClient::Impl::Release(const ObjectID &object_id) { object_entry->second->count -= 1; RAY_CHECK(object_entry->second->count >= 0); // Check if the client is no longer using this object. - if (object_entry->second->count == 0) { + // TODO(swang): Nicer way to pin shared objects. + if (object_entry->second->count == 0 && !object_entry->second->is_shared) { // object_entry is invalidated in MarkObjectUnused, need to read the fd beforehand. MEMFD_TYPE fd = object_entry->second->object.store_fd; // Tell the store that the client no longer needs the object. @@ -648,7 +763,7 @@ Status PlasmaClient::Impl::Contains(const ObjectID &object_id, bool *has_object) return Status::OK(); } -Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { +Status PlasmaClient::Impl::Seal(const ObjectID &object_id, int64_t num_readers) { std::lock_guard guard(client_mutex_); // Make sure this client has a reference to the object before sending the @@ -662,20 +777,33 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { return Status::ObjectAlreadySealed("Seal() called on an already sealed object"); } + auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); + // The value should've already updated when object is created. 
+ auto next_version_to_write = plasma_header->version; + plasma_header->WriteRelease( + /*write_version=*/next_version_to_write, num_readers); + object_entry->second->next_version_to_write = next_version_to_write; + + if (num_readers != -1) { + object_entry->second->is_shared = true; + } object_entry->second->is_sealed = true; - /// Send the seal request to Plasma. - RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); - std::vector buffer; - RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); - ObjectID sealed_id; - RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); - RAY_CHECK(sealed_id == object_id); - // We call PlasmaClient::Release to decrement the number of instances of this - // object - // that are currently being used by this client. The corresponding increment - // happened in plasma_create and was used to ensure that the object was not - // released before the call to PlasmaClient::Seal. - return Release(object_id); + //// Send the seal request to Plasma. + // RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); + // std::vector buffer; + // RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); + // ObjectID sealed_id; + // RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); + // RAY_CHECK(sealed_id == object_id); + //// We call PlasmaClient::Release to decrement the number of instances of this + //// object + //// that are currently being used by this client. The corresponding increment + //// happened in plasma_create and was used to ensure that the object was not + //// released before the call to PlasmaClient::Seal. + // return Release(object_id); + + // TODO(swang): Release the object if the ref count == 0. + return Status::OK(); } Status PlasmaClient::Impl::Abort(const ObjectID &object_id) { @@ -847,6 +975,10 @@ Status PlasmaClient::Get(const std::vector &object_ids, return impl_->Get(object_ids, timeout_ms, object_buffers, is_from_worker); } +Status PlasmaClient::GetRelease(const ObjectID &object_id) { + return impl_->GetRelease(object_id); +} + Status PlasmaClient::Release(const ObjectID &object_id) { return impl_->Release(object_id); } @@ -857,7 +989,9 @@ Status PlasmaClient::Contains(const ObjectID &object_id, bool *has_object) { Status PlasmaClient::Abort(const ObjectID &object_id) { return impl_->Abort(object_id); } -Status PlasmaClient::Seal(const ObjectID &object_id) { return impl_->Seal(object_id); } +Status PlasmaClient::Seal(const ObjectID &object_id, int64_t num_readers) { + return impl_->Seal(object_id, num_readers); +} Status PlasmaClient::Delete(const ObjectID &object_id) { return impl_->Delete(std::vector{object_id}); diff --git a/src/ray/object_manager/plasma/client.h b/src/ray/object_manager/plasma/client.h index d466528ecd275..74841df373fee 100644 --- a/src/ray/object_manager/plasma/client.h +++ b/src/ray/object_manager/plasma/client.h @@ -82,13 +82,15 @@ class PlasmaClientInterface { std::vector *object_buffers, bool is_from_worker) = 0; + virtual Status GetRelease(const ObjectID &object_id) = 0; + /// Seal an object in the object store. The object will be immutable after /// this /// call. /// /// \param object_id The ID of the object to seal. /// \return The return status. - virtual Status Seal(const ObjectID &object_id) = 0; + virtual Status Seal(const ObjectID &object_id, int64_t num_readers = -1) = 0; /// Abort an unsealed object in the object store. 
If the abort succeeds, then /// it will be as if the object was never created at all. The unsealed object @@ -255,6 +257,8 @@ class PlasmaClient : public PlasmaClientInterface { std::vector *object_buffers, bool is_from_worker); + Status GetRelease(const ObjectID &object_id); + /// Tell Plasma that the client no longer needs the object. This should be /// called after Get() or Create() when the client is done with the object. /// After this call, the buffer returned by Get() is no longer valid. @@ -290,7 +294,7 @@ class PlasmaClient : public PlasmaClientInterface { /// /// \param object_id The ID of the object to seal. /// \return The return status. - Status Seal(const ObjectID &object_id); + Status Seal(const ObjectID &object_id, int64_t num_readers = -1); /// Delete an object from the object store. This currently assumes that the /// object is present, has been sealed and not used by another client. Otherwise, diff --git a/src/ray/object_manager/plasma/common.h b/src/ray/object_manager/plasma/common.h index a4e8f8337372b..d74eb88cec8b8 100644 --- a/src/ray/object_manager/plasma/common.h +++ b/src/ray/object_manager/plasma/common.h @@ -123,14 +123,21 @@ class LocalObject { const plasma::flatbuf::ObjectSource &GetSource() const { return source; } + ray::PlasmaObjectHeader *GetPlasmaObjectHeader() const { + auto header_ptr = static_cast(allocation.address); + return reinterpret_cast(header_ptr); + } + void ToPlasmaObject(PlasmaObject *object, bool check_sealed) const { RAY_DCHECK(object != nullptr); if (check_sealed) { RAY_DCHECK(Sealed()); } object->store_fd = GetAllocation().fd; - object->data_offset = GetAllocation().offset; - object->metadata_offset = GetAllocation().offset + GetObjectInfo().data_size; + object->header_offset = GetAllocation().offset; + object->data_offset = GetAllocation().offset + sizeof(ray::PlasmaObjectHeader); + object->metadata_offset = GetAllocation().offset + sizeof(ray::PlasmaObjectHeader) + + GetObjectInfo().data_size; object->data_size = GetObjectInfo().data_size; object->metadata_size = GetObjectInfo().metadata_size; object->device_num = GetAllocation().device_num; diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index a36ad1d54906a..260951f178567 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -47,6 +47,10 @@ const LocalObject *ObjectStore::CreateObject(const ray::ObjectInfo &object_info, entry->construct_duration = -1; entry->source = source; + auto plasma_header = entry->GetPlasmaObjectHeader(); + *plasma_header = ray::PlasmaObjectHeader{}; + plasma_header->Init(); + RAY_LOG(DEBUG) << "create object " << object_info.object_id << " succeeded"; return entry; } @@ -74,6 +78,10 @@ bool ObjectStore::DeleteObject(const ObjectID &object_id) { if (entry == nullptr) { return false; } + // TODO(swang): Make sure Seal coroutine is done before deleting. + auto plasma_header = entry->GetPlasmaObjectHeader(); + plasma_header->Destroy(); + allocator_.Free(std::move(entry->allocation)); object_table_.erase(object_id); return true; diff --git a/src/ray/object_manager/plasma/plasma.fbs b/src/ray/object_manager/plasma/plasma.fbs index e5e7714aebc20..68177465f3366 100644 --- a/src/ray/object_manager/plasma/plasma.fbs +++ b/src/ray/object_manager/plasma/plasma.fbs @@ -96,6 +96,8 @@ struct PlasmaObjectSpec { segment_index: int; // The unique id of the segment fd in case of fd reuse. 
unique_fd_id: long; + // The offset in bytes in the memory mapped file of the plasma object header. + header_offset: ulong; // The offset in bytes in the memory mapped file of the data. data_offset: ulong; // The size in bytes of the data. diff --git a/src/ray/object_manager/plasma/plasma.h b/src/ray/object_manager/plasma/plasma.h index 0f8a00b061424..775226c922665 100644 --- a/src/ray/object_manager/plasma/plasma.h +++ b/src/ray/object_manager/plasma/plasma.h @@ -37,6 +37,9 @@ struct PlasmaObject { /// a unique identifier of the file in the client to look up the corresponding /// file descriptor on the client's side. MEMFD_TYPE store_fd; + /// The offset in bytes in the memory mapped file of the plasma object + /// header. + ptrdiff_t header_offset; /// The offset in bytes in the memory mapped file of the data. ptrdiff_t data_offset; /// The offset in bytes in the memory mapped file of the metadata. diff --git a/src/ray/object_manager/plasma/plasma_allocator.cc b/src/ray/object_manager/plasma/plasma_allocator.cc index 3737024ab416a..06cdb20bf3d5d 100644 --- a/src/ray/object_manager/plasma/plasma_allocator.cc +++ b/src/ray/object_manager/plasma/plasma_allocator.cc @@ -75,7 +75,7 @@ PlasmaAllocator::PlasmaAllocator(const std::string &plasma_directory, auto allocation = Allocate(kFootprintLimit - kDlMallocReserved); RAY_CHECK(allocation.has_value()) << "PlasmaAllocator initialization failed." - << " It's likely we don't have enought space in " << plasma_directory; + << " It's likely we don't have enough space in " << plasma_directory; // This will unmap the file, but the next one created will be as large // as this one (this is an implementation detail of dlmalloc). Free(std::move(allocation.value())); diff --git a/src/ray/object_manager/plasma/protocol.cc b/src/ray/object_manager/plasma/protocol.cc index 50595cde53701..79b9a27827fb1 100644 --- a/src/ray/object_manager/plasma/protocol.cc +++ b/src/ray/object_manager/plasma/protocol.cc @@ -260,6 +260,7 @@ Status SendCreateReply(const std::shared_ptr &client, flatbuffers::FlatBufferBuilder fbb; PlasmaObjectSpec plasma_object(FD2INT(object.store_fd.first), object.store_fd.second, + object.header_offset, object.data_offset, object.data_size, object.metadata_offset, @@ -300,6 +301,7 @@ Status ReadCreateReply(uint8_t *data, object->store_fd.first = INT2FD(message->plasma_object()->segment_index()); object->store_fd.second = message->plasma_object()->unique_fd_id(); + object->header_offset = message->plasma_object()->header_offset(); object->data_offset = message->plasma_object()->data_offset(); object->data_size = message->plasma_object()->data_size(); object->metadata_offset = message->plasma_object()->metadata_offset(); @@ -614,6 +616,7 @@ Status SendGetReply(const std::shared_ptr &client, << " metadata_size: " << object.metadata_size; objects.push_back(PlasmaObjectSpec(FD2INT(object.store_fd.first), object.store_fd.second, + object.header_offset, object.data_offset, object.data_size, object.metadata_offset, @@ -654,6 +657,7 @@ Status ReadGetReply(uint8_t *data, const PlasmaObjectSpec *object = message->plasma_objects()->Get(i); plasma_objects[i].store_fd.first = INT2FD(object->segment_index()); plasma_objects[i].store_fd.second = object->unique_fd_id(); + plasma_objects[i].header_offset = object->header_offset(); plasma_objects[i].data_offset = object->data_offset(); plasma_objects[i].data_size = object->data_size(); plasma_objects[i].metadata_offset = object->metadata_offset(); diff --git a/src/ray/object_manager/plasma/store.cc 
b/src/ray/object_manager/plasma/store.cc index 66876de42cbcc..e948e885aecfa 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -31,7 +31,9 @@ #include #include #include +#include +#include #include #include #include @@ -525,11 +527,65 @@ void PlasmaStore::ReplyToCreateClient(const std::shared_ptr &client, error == PlasmaError::OK && result.device_num == 0) { static_cast(client->SendFd(result.store_fd)); } + + WaitForSeal(object_id, client); } else { static_cast(SendUnfinishedCreateReply(client, object_id, req_id)); } } +void PlasmaStore::WaitForSeal(const ObjectID &object_id, + const std::shared_ptr &client) { + auto entry = object_lifecycle_mgr_.GetObject(object_id); + RAY_CHECK(entry); + auto plasma_header = entry->GetPlasmaObjectHeader(); + + int event_fd = eventfd(0, EFD_CLOEXEC); + RAY_CHECK(event_fd != -1); + + auto wait_fn = [event_fd, plasma_header]() { + plasma_header->ReadAcquire(/*read_version=*/1); + + uint64_t data = 1; + auto num_bytes_written = write(event_fd, &data, sizeof(data)); + // TODO(swang): Need proper error checking here. + if (num_bytes_written != sizeof(data)) { + RAY_LOG(WARNING) << num_bytes_written << " bytes written on fd " << event_fd + << " err: " << strerror(errno); + } + }; + + auto wait_thread = std::make_shared(wait_fn); + + boost::asio::spawn( + io_context_, + [this, event_fd, object_id, plasma_header, wait_thread, client]( + boost::asio::yield_context yield) { + auto event_stream = std::make_shared( + io_context_, event_fd); + auto data = std::make_shared(0); + auto buf = boost::asio::buffer(data.get(), sizeof(*data)); + boost::asio::async_read( + *event_stream, + buf, + [this, event_stream, data, object_id, event_fd, wait_thread]( + const boost::system::error_code &ec, size_t bytes_transferred) { + RAY_CHECK(bytes_transferred == sizeof(*data)) << ec.message(); + + // RAY_CHECK(plasma_header->num_readers == -1) << + // plasma_header->num_readers; + + { + absl::MutexLock lock(&mutex_); + SealObjects({object_id}); + } + + wait_thread->join(); + close(event_fd); + }); + }); +} + int64_t PlasmaStore::GetConsumedBytes() { return total_consumed_bytes_; } bool PlasmaStore::IsObjectSpillable(const ObjectID &object_id) { diff --git a/src/ray/object_manager/plasma/store.h b/src/ray/object_manager/plasma/store.h index a6c992c131280..74c33edb9a2c6 100644 --- a/src/ray/object_manager/plasma/store.h +++ b/src/ray/object_manager/plasma/store.h @@ -118,6 +118,8 @@ class PlasmaStore { return available; } + void WaitForSeal(const ObjectID &object_id, const std::shared_ptr &client); + private: /// Create a new object. The client must do a call to release_object to tell /// the store when it is done with the object. 
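For reference, the handshake implemented by PlasmaObjectHeader in the patch above works like this: the writer calls WriteAcquire, which blocks on rw_semaphore until every reader of the previous version has released it, writes the new data, and then calls WriteRelease to publish the version to num_readers readers; each reader calls ReadAcquire to wait for that version and ReadRelease when finished, and the last ReadRelease posts the semaphore so the writer may write again. The sketch below is a minimal, standalone model of that handshake, not the Ray implementation: it reuses the field and method names from common.h but omits the plasma store, the mmap'd object files, the immutable (num_readers == -1) path, and most error checking. (The next patch moves the num_readers assignment out of WriteRelease, but the handshake itself is unchanged.)

// Standalone model of the PlasmaObjectHeader writer/reader handshake.
// Assumed/illustrative: struct layout, sizes, and the fork-based demo.
// Build with: g++ -std=c++17 -pthread header_demo.cc
#include <pthread.h>
#include <semaphore.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>
#include <new>

struct Header {
  sem_t rw_semaphore;      // writer waits here until all reads are released
  pthread_mutex_t wr_mut;  // protects the fields below
  pthread_cond_t cond;     // writer -> readers: a new version is ready
  int64_t version = 0;
  int64_t num_readers = 0;
  int64_t num_read_acquires_remaining = 0;
  int64_t num_read_releases_remaining = 0;
  uint64_t data_size = 0;

  void Init() {
    pthread_mutexattr_t m;
    pthread_mutexattr_init(&m);
    pthread_mutexattr_setpshared(&m, PTHREAD_PROCESS_SHARED);
    pthread_mutex_init(&wr_mut, &m);
    sem_init(&rw_semaphore, /*pshared=*/1, /*value=*/1);
    pthread_condattr_t c;
    pthread_condattr_init(&c);
    pthread_condattr_setpshared(&c, PTHREAD_PROCESS_SHARED);
    pthread_cond_init(&cond, &c);
  }

  void WriteAcquire(int64_t write_version, uint64_t new_size) {
    sem_wait(&rw_semaphore);  // wait until the previous version is fully read
    pthread_mutex_lock(&wr_mut);
    version = write_version;
    data_size = new_size;
    pthread_mutex_unlock(&wr_mut);
  }

  void WriteRelease(int64_t write_version, int64_t readers) {
    pthread_mutex_lock(&wr_mut);
    num_readers = readers;
    num_read_acquires_remaining = readers;
    num_read_releases_remaining = readers;
    pthread_mutex_unlock(&wr_mut);
    pthread_cond_broadcast(&cond);  // wake readers waiting for this version
  }

  int64_t ReadAcquire(int64_t read_version) {
    pthread_mutex_lock(&wr_mut);
    while (version < read_version || num_read_acquires_remaining == 0) {
      pthread_cond_wait(&cond, &wr_mut);
    }
    num_read_acquires_remaining--;
    int64_t got = version;  // tell the caller which version it actually read
    pthread_mutex_unlock(&wr_mut);
    return got;
  }

  void ReadRelease(int64_t /*read_version*/) {
    pthread_mutex_lock(&wr_mut);
    bool all_readers_done = (--num_read_releases_remaining == 0);
    pthread_mutex_unlock(&wr_mut);
    if (all_readers_done) sem_post(&rw_semaphore);  // re-arm the writer
  }
};

int main() {
  // Place the header in anonymous shared memory so a forked reader sees it,
  // standing in for the header at the start of a plasma allocation.
  auto *hdr = static_cast<Header *>(mmap(nullptr, sizeof(Header),
                                         PROT_READ | PROT_WRITE,
                                         MAP_SHARED | MAP_ANONYMOUS, -1, 0));
  new (hdr) Header();
  hdr->Init();

  if (fork() == 0) {  // reader process
    int64_t v = hdr->ReadAcquire(/*read_version=*/1);
    std::printf("read version %lld, data_size %llu\n",
                (long long)v, (unsigned long long)hdr->data_size);
    hdr->ReadRelease(v);
    _exit(0);
  }

  hdr->WriteAcquire(/*write_version=*/1, /*new_size=*/64);  // begin write
  hdr->WriteRelease(/*write_version=*/1, /*readers=*/1);    // allow one read
  wait(nullptr);
  return 0;
}

In this model, as in the patch, a second WriteAcquire would block until the single permitted read has been released, which is the mechanism the store relies on in WaitForSeal and the client relies on in GetRelease.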
From 1c935b9a62b694578a2123bb24608b514391fab9 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 29 Nov 2023 15:57:05 -0800 Subject: [PATCH 06/66] Add special calls for create and put mutable objects Signed-off-by: Stephanie Wang --- python/ray/__init__.py | 2 + python/ray/_private/worker.py | 57 +++++- python/ray/_raylet.pxd | 3 +- python/ray/_raylet.pyx | 49 ++++-- python/ray/includes/libcoreworker.pxd | 13 +- python/ray/tests/test_accelerated_dag.py | 10 +- src/ray/core_worker/core_worker.cc | 25 ++- src/ray/core_worker/core_worker.h | 13 +- .../store_provider/plasma_store_provider.cc | 23 ++- .../store_provider/plasma_store_provider.h | 11 +- src/ray/object_manager/common.cc | 14 +- src/ray/object_manager/common.h | 5 +- src/ray/object_manager/object_buffer_pool.cc | 1 + src/ray/object_manager/plasma/client.cc | 163 +++++++++++------- src/ray/object_manager/plasma/client.h | 20 ++- src/ray/object_manager/plasma/object_store.cc | 7 + src/ray/object_manager/plasma/plasma.fbs | 2 + src/ray/object_manager/plasma/protocol.cc | 3 + src/ray/object_manager/plasma/protocol.h | 1 + src/ray/object_manager/plasma/store.cc | 54 ++---- src/ray/object_manager/plasma/store.h | 2 + 21 files changed, 324 insertions(+), 154 deletions(-) diff --git a/python/ray/__init__.py b/python/ray/__init__.py index 031f8054cf8e7..d95d2f5a20c5b 100644 --- a/python/ray/__init__.py +++ b/python/ray/__init__.py @@ -114,6 +114,8 @@ def _configure_system(): WORKER_MODE, RESTORE_WORKER_MODE, SPILL_WORKER_MODE, + _create_mutable_object, + _put_mutable_object, cancel, get, get_actor, diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index 94ab25be014b0..4e5b08f6dedb1 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -688,7 +688,7 @@ def set_mode(self, mode): def set_load_code_from_local(self, load_code_from_local): self._load_code_from_local = load_code_from_local - def put_object(self, value, object_ref=None, owner_address=None, max_readers=-1): + def put_object(self, value, object_ref=None, owner_address=None, is_mutable=False): """Put value in the local object store with object reference `object_ref`. This assumes that the value for `object_ref` has not yet been placed in @@ -736,6 +736,11 @@ def put_object(self, value, object_ref=None, owner_address=None, max_readers=-1) f"{sio.getvalue()}" ) raise TypeError(msg) from e + + # If the object is mutable, then the raylet should never read the + # object. Instead, clients will keep the object pinned. + pin_object = not is_mutable + # This *must* be the first place that we construct this python # ObjectRef because an entry with 0 local references is created when # the object is Put() in the core worker, expecting that this python @@ -746,8 +751,9 @@ def put_object(self, value, object_ref=None, owner_address=None, max_readers=-1) self.core_worker.put_serialized_object_and_increment_local_ref( serialized_value, object_ref=object_ref, + pin_object=pin_object, owner_address=owner_address, - max_readers=max_readers, + is_mutable=is_mutable, ), # The initial local reference is already acquired internally. 
skip_adding_local_ref=True, @@ -2629,13 +2635,54 @@ def get( return values +@PublicAPI +def _put_mutable_object(value: Any, object_ref: ObjectRef, num_readers: int): + worker = global_worker + worker.check_connected() + + try: + serialized_value = worker.get_serialization_context().serialize(value) + except TypeError as e: + sio = io.StringIO() + ray.util.inspect_serializability(value, print_file=sio) + msg = ( + "Could not serialize the put value " f"{repr(value)}:\n" f"{sio.getvalue()}" + ) + raise TypeError(msg) from e + + worker.core_worker.put_serialized_object_to_mutable_plasma_object( + serialized_value, + object_ref, + num_readers, + ) + + +@PublicAPI +def _create_mutable_object( + buffer_size: int, +) -> "ray.ObjectRef": + worker = global_worker + worker.check_connected() + + value = b"0" * buffer_size + + try: + object_ref = worker.put_object(value, owner_address=None, is_mutable=True) + except ObjectStoreFullError: + logger.info( + "Put failed since the value was either too large or the " + "store was full of pinned objects." + ) + raise + return object_ref + + @PublicAPI @client_mode_hook def put( value: Any, *, _owner: Optional["ray.actor.ActorHandle"] = None, - max_readers=-1, ) -> "ray.ObjectRef": """Store an object in the object store. @@ -2681,9 +2728,7 @@ def put( with profiling.profile("ray.put"): try: - object_ref = worker.put_object( - value, owner_address=serialize_owner_address, max_readers=max_readers - ) + object_ref = worker.put_object(value, owner_address=serialize_owner_address) except ObjectStoreFullError: logger.info( "Put failed since the value was either too large or the " diff --git a/python/ray/_raylet.pxd b/python/ray/_raylet.pxd index 015c636454dfe..f4f54cffacec0 100644 --- a/python/ray/_raylet.pxd +++ b/python/ray/_raylet.pxd @@ -134,7 +134,8 @@ cdef class CoreWorker: CObjectID *c_object_id, shared_ptr[CBuffer] *data, c_bool created_by_worker, owner_address=*, - c_bool inline_small_object=*) + c_bool inline_small_object=*, + c_bool is_mutable=*) cdef unique_ptr[CAddress] _convert_python_address(self, address=*) cdef store_task_output( self, serialized_object, diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index e55a29dd08226..3aa2422180c07 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -3373,7 +3373,9 @@ cdef class CoreWorker: CObjectID *c_object_id, shared_ptr[CBuffer] *data, c_bool created_by_worker, owner_address=None, - c_bool inline_small_object=True): + c_bool inline_small_object=True, + c_bool is_mutable=False, + ): cdef: unique_ptr[CAddress] c_owner_address @@ -3383,7 +3385,7 @@ cdef class CoreWorker: with nogil: check_status(CCoreWorkerProcess.GetCoreWorker() .CreateOwnedAndIncrementLocalRef( - metadata, data_size, contained_ids, + is_mutable, metadata, data_size, contained_ids, c_object_id, data, created_by_worker, move(c_owner_address), inline_small_object)) @@ -3470,15 +3472,42 @@ cdef class CoreWorker: CCoreWorkerProcess.GetCoreWorker().SealExisting( c_object_id, pin_object=False, generator_id=CObjectID.Nil(), - owner_address=c_owner_address, - max_readers=-1)) + owner_address=c_owner_address)) + + def put_serialized_object_to_mutable_plasma_object(self, serialized_object, + ObjectRef object_ref, + num_readers, + ): + cdef: + CObjectID c_object_id = object_ref.native() + shared_ptr[CBuffer] data + unique_ptr[CAddress] null_owner_address + + metadata = string_to_buffer(serialized_object.metadata) + data_size = serialized_object.total_bytes + 
check_status(CCoreWorkerProcess.GetCoreWorker().WriteAcquireMutableObject( + c_object_id, + metadata, + data_size, + num_readers, + &data, + )) + if data_size > 0: + (serialized_object).write_to( + Buffer.make(data)) + check_status( + CCoreWorkerProcess.GetCoreWorker().SealExisting( + c_object_id, pin_object=False, + generator_id=CObjectID.Nil(), + owner_address=null_owner_address)) def put_serialized_object_and_increment_local_ref(self, serialized_object, ObjectRef object_ref=None, c_bool pin_object=True, owner_address=None, c_bool inline_small_object=True, - max_readers=-1): + c_bool is_mutable=False, + ): cdef: CObjectID c_object_id shared_ptr[CBuffer] data @@ -3486,7 +3515,6 @@ cdef class CoreWorker: unique_ptr[CAddress] c_owner_address c_vector[CObjectID] contained_object_ids c_vector[CObjectReference] contained_object_refs - int64_t c_max_readers = max_readers metadata = string_to_buffer(serialized_object.metadata) total_bytes = serialized_object.total_bytes @@ -3495,7 +3523,8 @@ cdef class CoreWorker: object_already_exists = self._create_put_buffer( metadata, total_bytes, object_ref, contained_object_ids, - &c_object_id, &data, True, owner_address, inline_small_object) + &c_object_id, &data, True, owner_address, inline_small_object, + is_mutable) logger.debug( f"Serialized object size of {c_object_id.Hex()} is {total_bytes} bytes") @@ -3524,8 +3553,7 @@ cdef class CoreWorker: CCoreWorkerProcess.GetCoreWorker().SealOwned( c_object_id, pin_object, - move(c_owner_address), - c_max_readers)) + move(c_owner_address))) else: # Using custom object refs is not supported because we # can't track their lifecycle, so we don't pin the @@ -3534,8 +3562,7 @@ cdef class CoreWorker: CCoreWorkerProcess.GetCoreWorker().SealExisting( c_object_id, pin_object=False, generator_id=CObjectID.Nil(), - owner_address=move(c_owner_address), - max_readers=c_max_readers)) + owner_address=move(c_owner_address))) return c_object_id.Binary() diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 28fa6375212bb..bf6a6e810fc9d 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -226,6 +226,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: const c_vector[CObjectID] &contained_object_ids, const CObjectID &object_id) CRayStatus CreateOwnedAndIncrementLocalRef( + c_bool is_mutable, const shared_ptr[CBuffer] &metadata, const size_t data_size, const c_vector[CObjectID] &contained_object_ids, @@ -239,13 +240,17 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: const CAddress &owner_address, shared_ptr[CBuffer] *data, c_bool created_by_worker) + CRayStatus WriteAcquireMutableObject( + const CObjectID &object_id, + const shared_ptr[CBuffer] &metadata, + uint64_t data_size, + int64_t num_readers, + shared_ptr[CBuffer] *data) CRayStatus SealOwned(const CObjectID &object_id, c_bool pin_object, - const unique_ptr[CAddress] &owner_address, - int64_t max_readers) + const unique_ptr[CAddress] &owner_address) CRayStatus SealExisting(const CObjectID &object_id, c_bool pin_object, const CObjectID &generator_id, - const unique_ptr[CAddress] &owner_address, - int64_t max_readers) + const unique_ptr[CAddress] &owner_address) CRayStatus GetRelease(const c_vector[CObjectID] &object_ids) CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms, c_vector[shared_ptr[CRayObject]] *results) diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index 8f6286a3c5351..405d78d46cc9b 100644 --- 
a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -12,12 +12,10 @@ def test_put_mutable_object(ray_start_cluster): - # ref = ray.create_mutable_object(size_bytes=1000) - - max_readers = 1 - arr = b"binary" - ref = ray.put(arr, max_readers=max_readers) - ray.release(ref) + ray.init() + ref = ray._create_mutable_object(1000) + ray._put_mutable_object(b"hello", ref, num_readers=1) + assert ray.get(ref) == b"hello" if __name__ == "__main__": diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 586b69714eb35..72f5b698d9234 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1219,6 +1219,7 @@ Status CoreWorker::Put(const RayObject &object, } Status CoreWorker::CreateOwnedAndIncrementLocalRef( + bool is_mutable, const std::shared_ptr &metadata, const size_t data_size, const std::vector &contained_object_ids, @@ -1293,7 +1294,8 @@ Status CoreWorker::CreateOwnedAndIncrementLocalRef( *object_id, /* owner_address = */ real_owner_address, data, - created_by_worker); + created_by_worker, + is_mutable); } if (!status.ok()) { RemoveLocalReference(*object_id); @@ -1324,12 +1326,20 @@ Status CoreWorker::CreateExisting(const std::shared_ptr &metadata, } } +Status CoreWorker::WriteAcquireMutableObject(const ObjectID &object_id, + const std::shared_ptr &metadata, + uint64_t data_size, + int64_t num_readers, + std::shared_ptr *data) { + return plasma_store_provider_->WriteAcquireMutableObject( + object_id, metadata, data_size, num_readers, data); +} + Status CoreWorker::SealOwned(const ObjectID &object_id, bool pin_object, - const std::unique_ptr &owner_address, - int64_t max_readers) { - auto status = SealExisting( - object_id, pin_object, ObjectID::Nil(), std::move(owner_address), max_readers); + const std::unique_ptr &owner_address) { + auto status = + SealExisting(object_id, pin_object, ObjectID::Nil(), std::move(owner_address)); if (status.ok()) return status; RemoveLocalReference(object_id); if (reference_counter_->HasReference(object_id)) { @@ -1343,9 +1353,8 @@ Status CoreWorker::SealOwned(const ObjectID &object_id, Status CoreWorker::SealExisting(const ObjectID &object_id, bool pin_object, const ObjectID &generator_id, - const std::unique_ptr &owner_address, - int64_t max_readers) { - RAY_RETURN_NOT_OK(plasma_store_provider_->Seal(object_id, max_readers)); + const std::unique_ptr &owner_address) { + RAY_RETURN_NOT_OK(plasma_store_provider_->Seal(object_id)); if (pin_object) { // Tell the raylet to pin the object **after** it is created. RAY_LOG(DEBUG) << "Pinning sealed object " << object_id; diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index d4f621acd6380..3db71661d8695 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -614,6 +614,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// small. /// \return Status. Status CreateOwnedAndIncrementLocalRef( + bool is_mutable, const std::shared_ptr &metadata, const size_t data_size, const std::vector &contained_object_ids, @@ -642,6 +643,12 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { std::shared_ptr *data, bool created_by_worker); + Status WriteAcquireMutableObject(const ObjectID &object_id, + const std::shared_ptr &metadata, + uint64_t data_size, + int64_t num_readers, + std::shared_ptr *data); + /// Finalize placing an object into the object store. 
This should be called after /// a corresponding `CreateOwned()` call and then writing into the returned buffer. /// @@ -656,8 +663,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// \return Status. Status SealOwned(const ObjectID &object_id, bool pin_object, - const std::unique_ptr &owner_address = nullptr, - int64_t max_readers = -1); + const std::unique_ptr &owner_address = nullptr); /// Finalize placing an object into the object store. This should be called after /// a corresponding `CreateExisting()` call and then writing into the returned buffer. @@ -674,8 +680,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { Status SealExisting(const ObjectID &object_id, bool pin_object, const ObjectID &generator_id = ObjectID::Nil(), - const std::unique_ptr &owner_address = nullptr, - int64_t max_readers = -1); + const std::unique_ptr &owner_address = nullptr); Status GetRelease(const std::vector &object_ids); diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.cc b/src/ray/core_worker/store_provider/plasma_store_provider.cc index eb440a668a9dd..30ae14daef662 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.cc +++ b/src/ray/core_worker/store_provider/plasma_store_provider.cc @@ -108,12 +108,27 @@ Status CoreWorkerPlasmaStoreProvider::Put(const RayObject &object, return Status::OK(); } +Status CoreWorkerPlasmaStoreProvider::WriteAcquireMutableObject( + const ObjectID &object_id, + const std::shared_ptr &metadata, + uint64_t data_size, + int64_t num_readers, + std::shared_ptr *data) { + return store_client_.WriteAcquireMutableObject(object_id, + data_size, + metadata ? metadata->Data() : nullptr, + metadata ? metadata->Size() : 0, + num_readers, + data); +} + Status CoreWorkerPlasmaStoreProvider::Create(const std::shared_ptr &metadata, const size_t data_size, const ObjectID &object_id, const rpc::Address &owner_address, std::shared_ptr *data, - bool created_by_worker) { + bool created_by_worker, + bool is_mutable) { auto source = plasma::flatbuf::ObjectSource::CreatedByWorker; if (!created_by_worker) { source = plasma::flatbuf::ObjectSource::RestoredFromStorage; @@ -121,6 +136,7 @@ Status CoreWorkerPlasmaStoreProvider::Create(const std::shared_ptr &meta Status status = store_client_.CreateAndSpillIfNeeded(object_id, owner_address, + is_mutable, data_size, metadata ? metadata->Data() : nullptr, metadata ? 
metadata->Size() : 0, @@ -153,9 +169,8 @@ Status CoreWorkerPlasmaStoreProvider::Create(const std::shared_ptr &meta return status; } -Status CoreWorkerPlasmaStoreProvider::Seal(const ObjectID &object_id, - int64_t max_readers) { - return store_client_.Seal(object_id, max_readers); +Status CoreWorkerPlasmaStoreProvider::Seal(const ObjectID &object_id) { + return store_client_.Seal(object_id); } Status CoreWorkerPlasmaStoreProvider::Release(const ObjectID &object_id) { diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.h b/src/ray/core_worker/store_provider/plasma_store_provider.h index 523aa86a3e5f0..2c7242a02f4a1 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.h +++ b/src/ray/core_worker/store_provider/plasma_store_provider.h @@ -126,7 +126,14 @@ class CoreWorkerPlasmaStoreProvider { const ObjectID &object_id, const rpc::Address &owner_address, std::shared_ptr *data, - bool created_by_worker); + bool created_by_worker, + bool is_mutable = false); + + Status WriteAcquireMutableObject(const ObjectID &object_id, + const std::shared_ptr &metadata, + uint64_t data_size, + int64_t num_readers, + std::shared_ptr *data); /// Seal an object buffer created with Create(). /// @@ -135,7 +142,7 @@ class CoreWorkerPlasmaStoreProvider { /// /// \param[in] object_id The ID of the object. This can be used as an /// argument to Get to retrieve the object data. - Status Seal(const ObjectID &object_id, int64_t max_readers = -1); + Status Seal(const ObjectID &object_id); /// Release the first reference to the object created by Put() or Create(). This should /// be called exactly once per object and until it is called, the object is pinned and diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index 4eff0d3e583b4..24e906d9ba4d5 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -67,12 +67,10 @@ void PlasmaObjectHeader::WriteAcquire(int64_t write_version, uint64_t new_size) RAY_CHECK(pthread_mutex_unlock(&wr_mut) == 0); } -void PlasmaObjectHeader::WriteRelease(int64_t write_version, int64_t write_num_readers) { - RAY_LOG(DEBUG) << "WriteRelease Waiting. version: " << write_version - << " max readers: " << write_num_readers; +void PlasmaObjectHeader::WriteRelease(int64_t write_version) { + RAY_LOG(DEBUG) << "WriteRelease Waiting. version: " << write_version; RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); - RAY_LOG(DEBUG) << "WriteRelease " << write_version - << " max readers: " << write_num_readers; + RAY_LOG(DEBUG) << "WriteRelease " << write_version; PrintPlasmaObjectHeader(this); RAY_CHECK(version == write_version) @@ -80,11 +78,11 @@ void PlasmaObjectHeader::WriteRelease(int64_t write_version, int64_t write_num_r << version << ". Are you sure this is the only writer?"; version = write_version; - num_readers = write_num_readers; + RAY_CHECK(num_readers != 0); num_read_acquires_remaining = num_readers; num_read_releases_remaining = num_readers; - RAY_LOG(DEBUG) << "WriteRelease done"; + RAY_LOG(DEBUG) << "WriteRelease done, num_readers: " << num_readers; PrintPlasmaObjectHeader(this); RAY_CHECK(pthread_mutex_unlock(&wr_mut) == 0); // Signal to all readers. 
@@ -92,7 +90,7 @@ void PlasmaObjectHeader::WriteRelease(int64_t write_version, int64_t write_num_r } int64_t PlasmaObjectHeader::ReadAcquire(int64_t read_version) { - RAY_LOG(DEBUG) << "ReadAcquire Waiting" << read_version; + RAY_LOG(DEBUG) << "ReadAcquire waiting version " << read_version; RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); RAY_LOG(DEBUG) << "ReadAcquire " << read_version; PrintPlasmaObjectHeader(this); diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 23634cbae7d35..395c86ee8223b 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -83,8 +83,9 @@ struct PlasmaObjectHeader { /// \param new_size The new data size of the object. void WriteAcquire(int64_t write_version, uint64_t new_data_size); - // Call after completing a write to signal to num_readers many readers. - void WriteRelease(int64_t write_version, int64_t num_readers); + // Call after completing a write to signal that readers may read. + // num_readers should be set before calling this. + void WriteRelease(int64_t write_version); // Blocks until the given version or a more recent version is ready to read. // diff --git a/src/ray/object_manager/object_buffer_pool.cc b/src/ray/object_manager/object_buffer_pool.cc index 8004fb588811d..a42a921fc50a7 100644 --- a/src/ray/object_manager/object_buffer_pool.cc +++ b/src/ray/object_manager/object_buffer_pool.cc @@ -241,6 +241,7 @@ ray::Status ObjectBufferPool::EnsureBufferExists(const ObjectID &object_id, Status s = store_client_->CreateAndSpillIfNeeded( object_id, owner_address, + /*is_mutable=*/false, static_cast(object_size), nullptr, static_cast(metadata_size), diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 31a507cb3dc92..57c62bf486df9 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -95,7 +95,7 @@ struct ObjectInUseEntry { PlasmaObject object; /// A flag representing whether the object has been sealed. bool is_sealed; - bool is_shared = false; + bool is_mutable = false; /// For shared objects only. /// The last version that we read or wrote. To read or write again, we must /// pass a newer version than this. @@ -119,6 +119,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data); + Status Get(const std::vector &object_ids, int64_t timeout_ms, std::vector *object_buffers, @@ -160,7 +168,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this &object_ids); @@ -371,6 +379,7 @@ Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, // client is using. A call to PlasmaClient::Release is required to decrement // this count. Cache the reference to the object. IncrementObjectCount(object_id, &object, false); + // TODO(swang): Remove the second increment call. 
// We increment the count a second time (and the corresponding decrement will // happen in a PlasmaClient::Release call in plasma_seal) so even if the // buffer returned by PlasmaClient::Create goes out of scope, the object does @@ -379,8 +388,57 @@ Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, return Status::OK(); } +Status PlasmaClient::Impl::WriteAcquireMutableObject(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data) { + std::unique_lock guard(client_mutex_); + auto object_entry = objects_in_use_.find(object_id); + RAY_CHECK(object_entry != objects_in_use_.end()); + + auto &entry = object_entry->second; + RAY_CHECK(entry->is_mutable); + RAY_CHECK(entry->is_sealed) << "Must Seal before writing again to a mutable object"; + + RAY_LOG(DEBUG) << "Write mutable object " << object_id; + + // Wait for no readers. + auto plasma_header = GetPlasmaObjectHeader(entry->object); + // NOTE: entry->object.data_size is the size of the data buffer. + // When the object is shared, we can have object size smaller than the data buffer. + // TODO(swang): Better exception. + // TODO(swang): Support data size larger than allocated buffer. + RAY_CHECK(data_size <= entry->object.data_size) + << "Cannot write mutable data size " << data_size + << " larger than allocated buffer size " << entry->object.data_size; + // TODO(swang): Support different metadata size. + RAY_CHECK(metadata_size == entry->object.metadata_size) + << "Metadata size must stay the same"; + plasma_header->WriteAcquire(entry->next_version_to_write, data_size); + plasma_header->num_readers = num_readers; + + // Prepare the data buffer and return to the client instead of sending + // the IPC to object store. + *data = std::make_shared( + shared_from_this(), + GetStoreFdAndMmap(entry->object.store_fd, entry->object.mmap_size) + + entry->object.data_offset, + data_size); + if (metadata != NULL) { + // Copy the metadata to the buffer. + memcpy( + (*data)->Data() + entry->object.data_size, metadata, entry->object.metadata_size); + } + + entry->is_sealed = false; + return Status::OK(); +} + Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, + bool is_mutable, int64_t data_size, const uint8_t *metadata, int64_t metadata_size, @@ -388,44 +446,6 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, fb::ObjectSource source, int device_num) { std::unique_lock guard(client_mutex_); - auto object_entry = objects_in_use_.find(object_id); - if (object_entry != objects_in_use_.end()) { - auto &entry = object_entry->second; - if (entry->is_sealed && entry->is_shared) { - RAY_LOG(DEBUG) << "Create shared object " << object_id << " exists"; - // Wait for no readers. - auto plasma_header = GetPlasmaObjectHeader(entry->object); - // TODO(sang) - // NOTE: entry->object.data_size is the size of the data buffer. - // When the object is shared, we can have object size smaller than the data buffer. - RAY_LOG(DEBUG) << "SANG-TODO Update the data size of " << object_id - << ". Size: " << data_size; - auto next_version_to_write = plasma_header->version + 1; - plasma_header->WriteAcquire(next_version_to_write, data_size); - - // Prepare the data buffer and return to the client instead of sending - // the IPC to object store. 
- *data = std::make_shared( - shared_from_this(), - GetStoreFdAndMmap(entry->object.store_fd, entry->object.mmap_size) + - entry->object.data_offset, - entry->object.data_size); - // If plasma_create is being called from a transfer, then we will not copy the - // metadata here. The metadata will be written along with the data streamed - // from the transfer. - if (metadata != NULL) { - // Copy the metadata to the buffer. - memcpy((*data)->Data() + entry->object.data_size, - metadata, - entry->object.metadata_size); - } - - entry->is_sealed = false; - IncrementObjectCount(object_id, &entry->object, false); - } - return Status::OK(); - } - uint64_t retry_with_request_id = 0; RAY_LOG(DEBUG) << "called plasma_create on conn " << store_conn_ << " with size " @@ -433,6 +453,7 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, RAY_RETURN_NOT_OK(SendCreateRequest(store_conn_, object_id, owner_address, + is_mutable, data_size, metadata_size, source, @@ -454,16 +475,28 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, if (status.ok()) { // Create IPC was successful. - object_entry = objects_in_use_.find(object_id); + auto object_entry = objects_in_use_.find(object_id); RAY_CHECK(object_entry != objects_in_use_.end()); auto &entry = object_entry->second; RAY_CHECK(!entry->is_sealed); + entry->is_mutable = is_mutable; + auto plasma_header = GetPlasmaObjectHeader(entry->object); // The corresponding WriteRelease takes place in Seal. // When an object is first created, the data size is equivalent to // buffer size. // The first creation's version is always 1. - plasma_header->WriteAcquire(/*next_version_to_write*/ 1, entry->object.data_size); + RAY_CHECK(entry->next_version_to_write == 1); + plasma_header->WriteAcquire(/*next_version_to_write*/ entry->next_version_to_write, + entry->object.data_size); + if (entry->is_mutable) { + // The plasma store is the first reader. Once it read-releases, the + // writer may write an actual value. + plasma_header->num_readers = 1; + } else { + // Anyone may read. + plasma_header->num_readers = -1; + } } return status; @@ -494,6 +527,7 @@ Status PlasmaClient::Impl::TryCreateImmediately(const ObjectID &object_id, RAY_RETURN_NOT_OK(SendCreateRequest(store_conn_, object_id, owner_address, + /*is_mutable=*/false, data_size, metadata_size, source, @@ -537,7 +571,7 @@ Status PlasmaClient::Impl::GetBuffers( auto data_size = plasma_header->GetDataSize(); RAY_LOG(DEBUG) << "SANG-TODO data size is " << data_size; if (version_read > 0) { - object_entry->second->is_shared = true; + object_entry->second->is_mutable = true; object_entry->second->next_version_to_read = version_read; } @@ -616,7 +650,7 @@ Status PlasmaClient::Impl::GetBuffers( int64_t version_read = plasma_header->ReadAcquire(/*version=*/1); auto data_size = plasma_header->GetDataSize(); if (version_read > 0) { - object_entry->is_shared = true; + object_entry->is_mutable = true; object_entry->next_version_to_read = version_read; } @@ -670,9 +704,13 @@ Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { } auto &entry = object_entry->second; - // RAY_CHECK(entry->is_sealed && entry->is_shared) << "ray.release must be called on " - // "objects that are sealed and shared. sealed? 
" << entry->is_sealed - // << " shared " << entry->is_shared; + if (!entry->is_sealed) { + return Status::ObjectNotFound("ray.release() called on an object that is not sealed"); + } + if (!entry->is_mutable) { + return Status::ObjectNotFound( + "ray.release() called on an object that is not mutable"); + } RAY_LOG(DEBUG) << "Release shared object " << object_id; auto plasma_header = GetPlasmaObjectHeader(entry->object); @@ -707,7 +745,7 @@ Status PlasmaClient::Impl::Release(const ObjectID &object_id) { RAY_CHECK(object_entry->second->count >= 0); // Check if the client is no longer using this object. // TODO(swang): Nicer way to pin shared objects. - if (object_entry->second->count == 0 && !object_entry->second->is_shared) { + if (object_entry->second->count == 0 && !object_entry->second->is_mutable) { // object_entry is invalidated in MarkObjectUnused, need to read the fd beforehand. MEMFD_TYPE fd = object_entry->second->object.store_fd; // Tell the store that the client no longer needs the object. @@ -763,7 +801,7 @@ Status PlasmaClient::Impl::Contains(const ObjectID &object_id, bool *has_object) return Status::OK(); } -Status PlasmaClient::Impl::Seal(const ObjectID &object_id, int64_t num_readers) { +Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { std::lock_guard guard(client_mutex_); // Make sure this client has a reference to the object before sending the @@ -778,15 +816,10 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id, int64_t num_readers) } auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); - // The value should've already updated when object is created. - auto next_version_to_write = plasma_header->version; plasma_header->WriteRelease( - /*write_version=*/next_version_to_write, num_readers); - object_entry->second->next_version_to_write = next_version_to_write; - - if (num_readers != -1) { - object_entry->second->is_shared = true; - } + /*write_version=*/object_entry->second->next_version_to_write); + // The next Write must pass a higher version. + object_entry->second->next_version_to_write++; object_entry->second->is_sealed = true; //// Send the seal request to Plasma. 
// RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); @@ -932,8 +965,19 @@ Status PlasmaClient::Connect(const std::string &store_socket_name, store_socket_name, manager_socket_name, release_delay, num_retries); } +Status PlasmaClient::WriteAcquireMutableObject(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data) { + return impl_->WriteAcquireMutableObject( + object_id, data_size, metadata, metadata_size, num_readers, data); +} + Status PlasmaClient::CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, + bool is_mutable, int64_t data_size, const uint8_t *metadata, int64_t metadata_size, @@ -942,6 +986,7 @@ Status PlasmaClient::CreateAndSpillIfNeeded(const ObjectID &object_id, int device_num) { return impl_->CreateAndSpillIfNeeded(object_id, owner_address, + is_mutable, data_size, metadata, metadata_size, @@ -989,9 +1034,7 @@ Status PlasmaClient::Contains(const ObjectID &object_id, bool *has_object) { Status PlasmaClient::Abort(const ObjectID &object_id) { return impl_->Abort(object_id); } -Status PlasmaClient::Seal(const ObjectID &object_id, int64_t num_readers) { - return impl_->Seal(object_id, num_readers); -} +Status PlasmaClient::Seal(const ObjectID &object_id) { return impl_->Seal(object_id); } Status PlasmaClient::Delete(const ObjectID &object_id) { return impl_->Delete(std::vector{object_id}); diff --git a/src/ray/object_manager/plasma/client.h b/src/ray/object_manager/plasma/client.h index 74841df373fee..00c85cca3f11e 100644 --- a/src/ray/object_manager/plasma/client.h +++ b/src/ray/object_manager/plasma/client.h @@ -90,7 +90,7 @@ class PlasmaClientInterface { /// /// \param object_id The ID of the object to seal. /// \return The return status. - virtual Status Seal(const ObjectID &object_id, int64_t num_readers = -1) = 0; + virtual Status Seal(const ObjectID &object_id) = 0; /// Abort an unsealed object in the object store. If the abort succeeds, then /// it will be as if the object was never created at all. The unsealed object @@ -129,6 +129,7 @@ class PlasmaClientInterface { /// be either sealed or aborted. virtual Status CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, + bool is_mutable, int64_t data_size, const uint8_t *metadata, int64_t metadata_size, @@ -136,6 +137,13 @@ class PlasmaClientInterface { plasma::flatbuf::ObjectSource source, int device_num = 0) = 0; + virtual Status WriteAcquireMutableObject(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data) = 0; + /// Delete a list of objects from the object store. This currently assumes that the /// object is present, has been sealed and not used by another client. Otherwise, /// it is a no operation. @@ -195,6 +203,7 @@ class PlasmaClient : public PlasmaClientInterface { /// be either sealed or aborted. Status CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, + bool is_mutable, int64_t data_size, const uint8_t *metadata, int64_t metadata_size, @@ -202,6 +211,13 @@ class PlasmaClient : public PlasmaClientInterface { plasma::flatbuf::ObjectSource source, int device_num = 0); + Status WriteAcquireMutableObject(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data); + /// Create an object in the Plasma Store. 
Any metadata for this object must be /// be passed in when the object is created. /// @@ -294,7 +310,7 @@ class PlasmaClient : public PlasmaClientInterface { /// /// \param object_id The ID of the object to seal. /// \return The return status. - Status Seal(const ObjectID &object_id, int64_t num_readers = -1); + Status Seal(const ObjectID &object_id); /// Delete an object from the object store. This currently assumes that the /// object is present, has been sealed and not used by another client. Otherwise, diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index 260951f178567..7d60c3ff1394a 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -70,6 +70,13 @@ const LocalObject *ObjectStore::SealObject(const ObjectID &object_id) { } entry->state = ObjectState::PLASMA_SEALED; entry->construct_duration = std::time(nullptr) - entry->create_time; + auto plasma_header = entry->GetPlasmaObjectHeader(); + if (entry->object_info.is_mutable) { + // Register the sealed object before allowing the writer to write. + plasma_header->ReadRelease(/*read_version=*/1); + } else { + RAY_CHECK(plasma_header->num_readers == -1) << plasma_header->num_readers; + } return entry; } diff --git a/src/ray/object_manager/plasma/plasma.fbs b/src/ray/object_manager/plasma/plasma.fbs index 68177465f3366..ba2df089c6032 100644 --- a/src/ray/object_manager/plasma/plasma.fbs +++ b/src/ray/object_manager/plasma/plasma.fbs @@ -129,6 +129,8 @@ table PlasmaCreateRequest { owner_port: int; // Unique id for the owner worker. owner_worker_id: string; + // Whether the object will be mutable. + is_mutable: bool; // The size of the object's data in bytes. data_size: ulong; // The size of the object's metadata in bytes. 
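Note: taken together, the plasma client and store changes above implement a single-writer, multi-reader handshake on the shared object header: the writer publishes a new version with WriteAcquire (released on Seal via WriteRelease) together with the expected number of readers, and each reader blocks in ReadAcquire until that version is visible, then calls ReadRelease so the buffer can be rewritten. A minimal sketch of the intended end-to-end flow (illustrative only; it uses the experimental Python wrappers that later commits in this series add, and those names may change):

    import ray

    ray.init()

    # Writer: allocate a reusable mutable plasma object once (Create with
    # is_mutable=true), then publish values into the same buffer.
    ref = ray._create_mutable_object(1000)
    ray._put_mutable_object(b"value", ref, num_readers=1)  # WriteAcquire; WriteRelease happens on Seal

    # Reader: ray.get blocks in ReadAcquire until the new version is published;
    # the explicit release calls ReadRelease so the writer can write again.
    assert ray.get(ref) == b"value"
    ray._release_mutable_object(ref)
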
diff --git a/src/ray/object_manager/plasma/protocol.cc b/src/ray/object_manager/plasma/protocol.cc index 79b9a27827fb1..50f1f60d332ec 100644 --- a/src/ray/object_manager/plasma/protocol.cc +++ b/src/ray/object_manager/plasma/protocol.cc @@ -200,6 +200,7 @@ Status SendCreateRetryRequest(const std::shared_ptr &store_conn, Status SendCreateRequest(const std::shared_ptr &store_conn, ObjectID object_id, const ray::rpc::Address &owner_address, + bool is_mutable, int64_t data_size, int64_t metadata_size, flatbuf::ObjectSource source, @@ -213,6 +214,7 @@ Status SendCreateRequest(const std::shared_ptr &store_conn, fbb.CreateString(owner_address.ip_address()), owner_address.port(), fbb.CreateString(owner_address.worker_id()), + is_mutable, data_size, metadata_size, source, @@ -229,6 +231,7 @@ void ReadCreateRequest(uint8_t *data, RAY_DCHECK(data); auto message = flatbuffers::GetRoot(data); RAY_DCHECK(VerifyFlatbuffer(message, data, size)); + object_info->is_mutable = message->is_mutable(); object_info->data_size = message->data_size(); object_info->metadata_size = message->metadata_size(); object_info->object_id = ObjectID::FromBinary(message->object_id()->str()); diff --git a/src/ray/object_manager/plasma/protocol.h b/src/ray/object_manager/plasma/protocol.h index 7f4fcdd3ac589..23a120ac2ca05 100644 --- a/src/ray/object_manager/plasma/protocol.h +++ b/src/ray/object_manager/plasma/protocol.h @@ -85,6 +85,7 @@ Status SendCreateRetryRequest(const std::shared_ptr &store_conn, Status SendCreateRequest(const std::shared_ptr &store_conn, ObjectID object_id, const ray::rpc::Address &owner_address, + bool is_mutable, int64_t data_size, int64_t metadata_size, flatbuf::ObjectSource source, diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index e948e885aecfa..c0aa1319b193f 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -540,50 +540,32 @@ void PlasmaStore::WaitForSeal(const ObjectID &object_id, RAY_CHECK(entry); auto plasma_header = entry->GetPlasmaObjectHeader(); - int event_fd = eventfd(0, EFD_CLOEXEC); - RAY_CHECK(event_fd != -1); + auto seal_signal = std::make_shared(io_context_); + seal_signal->expires_at(boost::posix_time::pos_infin); - auto wait_fn = [event_fd, plasma_header]() { + auto wait_fn = [this, seal_signal, plasma_header]() { plasma_header->ReadAcquire(/*read_version=*/1); - uint64_t data = 1; - auto num_bytes_written = write(event_fd, &data, sizeof(data)); - // TODO(swang): Need proper error checking here. 
- if (num_bytes_written != sizeof(data)) { - RAY_LOG(WARNING) << num_bytes_written << " bytes written on fd " << event_fd - << " err: " << strerror(errno); + { + absl::MutexLock lock(&seal_deadline_timer_mutex_); + seal_signal->cancel(); } }; auto wait_thread = std::make_shared(wait_fn); - boost::asio::spawn( - io_context_, - [this, event_fd, object_id, plasma_header, wait_thread, client]( - boost::asio::yield_context yield) { - auto event_stream = std::make_shared( - io_context_, event_fd); - auto data = std::make_shared(0); - auto buf = boost::asio::buffer(data.get(), sizeof(*data)); - boost::asio::async_read( - *event_stream, - buf, - [this, event_stream, data, object_id, event_fd, wait_thread]( - const boost::system::error_code &ec, size_t bytes_transferred) { - RAY_CHECK(bytes_transferred == sizeof(*data)) << ec.message(); - - // RAY_CHECK(plasma_header->num_readers == -1) << - // plasma_header->num_readers; - - { - absl::MutexLock lock(&mutex_); - SealObjects({object_id}); - } - - wait_thread->join(); - close(event_fd); - }); - }); + { + absl::MutexLock lock(&seal_deadline_timer_mutex_); + seal_signal->async_wait([this, object_id, plasma_header, wait_thread, client]( + const boost::system::error_code &ec) { + { + absl::MutexLock lock(&mutex_); + SealObjects({object_id}); + } + + wait_thread->join(); + }); + } } int64_t PlasmaStore::GetConsumedBytes() { return total_consumed_bytes_; } diff --git a/src/ray/object_manager/plasma/store.h b/src/ray/object_manager/plasma/store.h index 74c33edb9a2c6..3a4b457a6381b 100644 --- a/src/ray/object_manager/plasma/store.h +++ b/src/ray/object_manager/plasma/store.h @@ -309,6 +309,8 @@ class PlasmaStore { bool dumped_on_oom_ ABSL_GUARDED_BY(mutex_) = false; GetRequestQueue get_request_queue_ ABSL_GUARDED_BY(mutex_); + + absl::Mutex seal_deadline_timer_mutex_; }; } // namespace plasma From c2dbf1f8cf8a9e0c444a06af1817f956aaf9a9b5 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 29 Nov 2023 17:09:57 -0800 Subject: [PATCH 07/66] feature flag for shared mem seal, only acquire once per ray.get Signed-off-by: Stephanie Wang --- src/ray/common/ray_config_def.h | 4 + src/ray/object_manager/plasma/client.cc | 110 +++++++++++++++--------- src/ray/object_manager/plasma/store.cc | 7 +- 3 files changed, 78 insertions(+), 43 deletions(-) diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index d15d9fbffc7dd..ba6825b066954 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -886,3 +886,7 @@ RAY_CONFIG(bool, enable_autoscaler_v2, false) // Python GCS client number of reconnection retry and timeout. RAY_CONFIG(int64_t, nums_py_gcs_reconnect_retry, 5) RAY_CONFIG(int64_t, py_gcs_connect_timeout_s, 30) + +// Feature flag for whether to use shared-memory based synchronization to +// implement plasma object Seal. The current method is to instead use IPC. +RAY_CONFIG(bool, plasma_use_shared_memory_seal, false) diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 57c62bf486df9..95f5f8489c5c4 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -97,9 +97,17 @@ struct ObjectInUseEntry { bool is_sealed; bool is_mutable = false; /// For shared objects only. - /// The last version that we read or wrote. To read or write again, we must - /// pass a newer version than this. + /// The last version that we read. To read again, we must pass a newer + /// version than this. 
int64_t next_version_to_read = 1; + /// Whether we currently have a read lock on the object. If this is true, + /// then it is safe to read the value of the object. For immutable objects, + /// this will always be true once the object has been sealed. For mutable + /// objects, ReadRelease resets this to false, and ReadAcquire resets to + /// true. + bool read_acquired = false; + /// The last version that we wrote. To write again, we must pass a newer + /// version than this. int64_t next_version_to_write = 1; }; @@ -160,6 +168,9 @@ class PlasmaClient::Impl : public std::enable_shared_from_this &object_entry); + Status GetRelease(const ObjectID &object_id); Status Release(const ObjectID &object_id); @@ -562,18 +573,10 @@ Status PlasmaClient::Impl::GetBuffers( << "Attempting to get an object that this client created but hasn't sealed."; all_present = false; } else { - PlasmaObject *object = &object_entry->second->object; - // Wait for the object to become ready to read. - auto plasma_header = GetPlasmaObjectHeader(*object); - int64_t version_read = - plasma_header->ReadAcquire(object_entry->second->next_version_to_read); - auto data_size = plasma_header->GetDataSize(); - RAY_LOG(DEBUG) << "SANG-TODO data size is " << data_size; - if (version_read > 0) { - object_entry->second->is_mutable = true; - object_entry->second->next_version_to_read = version_read; - } + auto plasma_header = EnsureGetAcquired(object_entry->second); + + PlasmaObject *object = &object_entry->second->object; std::shared_ptr physical_buf; if (object->device_num == 0) { @@ -584,6 +587,7 @@ Status PlasmaClient::Impl::GetBuffers( RAY_LOG(FATAL) << "GPU library is not enabled."; } physical_buf = wrap_buffer(object_ids[i], physical_buf); + auto data_size = plasma_header->GetDataSize(); object_buffers[i].data = SharedMemoryBuffer::Slice(physical_buf, 0, data_size); object_buffers[i].metadata = SharedMemoryBuffer::Slice( physical_buf, object->data_size, object->metadata_size); @@ -645,15 +649,11 @@ Status PlasmaClient::Impl::GetBuffers( // client is using. Cache the reference to the object. IncrementObjectCount(received_object_ids[i], object, true); auto &object_entry = objects_in_use_[received_object_ids[i]]; + // Wait for the object to become ready to read. 
- auto plasma_header = GetPlasmaObjectHeader(*object); - int64_t version_read = plasma_header->ReadAcquire(/*version=*/1); + RAY_CHECK(!object_entry->read_acquired); + auto plasma_header = EnsureGetAcquired(object_entry); auto data_size = plasma_header->GetDataSize(); - if (version_read > 0) { - object_entry->is_mutable = true; - object_entry->next_version_to_read = version_read; - } - std::shared_ptr physical_buf; if (object->device_num == 0) { uint8_t *data = LookupMmappedFile(object->store_fd); @@ -694,9 +694,27 @@ Status PlasmaClient::Impl::Get(const std::vector &object_ids, &object_ids[0], num_objects, timeout_ms, wrap_buffer, &(*out)[0], is_from_worker); } +ray::PlasmaObjectHeader *PlasmaClient::Impl::EnsureGetAcquired( + std::unique_ptr &object_entry) { + PlasmaObject *object = &object_entry->object; + auto plasma_header = GetPlasmaObjectHeader(*object); + if (object_entry->read_acquired) { + return plasma_header; + } + + int64_t version_read = plasma_header->ReadAcquire(object_entry->next_version_to_read); + object_entry->read_acquired = true; + if (version_read > 0) { + object_entry->is_mutable = true; + object_entry->next_version_to_read = version_read; + } + return plasma_header; +} + Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { RAY_LOG(DEBUG) << "Try to release Get for object " << object_id; std::unique_lock guard(client_mutex_); + auto object_entry = objects_in_use_.find(object_id); if (object_entry == objects_in_use_.end()) { return Status::ObjectNotFound( @@ -712,8 +730,8 @@ Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { "ray.release() called on an object that is not mutable"); } + auto plasma_header = EnsureGetAcquired(entry); RAY_LOG(DEBUG) << "Release shared object " << object_id; - auto plasma_header = GetPlasmaObjectHeader(entry->object); plasma_header->ReadRelease(entry->next_version_to_read); // The next read needs to read at least this version. entry->next_version_to_read++; @@ -741,11 +759,14 @@ Status PlasmaClient::Impl::Release(const ObjectID &object_id) { const auto object_entry = objects_in_use_.find(object_id); RAY_CHECK(object_entry != objects_in_use_.end()); - object_entry->second->count -= 1; - RAY_CHECK(object_entry->second->count >= 0); - // Check if the client is no longer using this object. - // TODO(swang): Nicer way to pin shared objects. - if (object_entry->second->count == 0 && !object_entry->second->is_mutable) { + if (!object_entry->second->is_mutable) { + // Release only applies to immutable objects. + // TODO(swang): Add a delete call to properly clean up mutable objects. + object_entry->second->count -= 1; + RAY_CHECK(object_entry->second->count >= 0); + } + + if (object_entry->second->count == 0) { // object_entry is invalidated in MarkObjectUnused, need to read the fd beforehand. MEMFD_TYPE fd = object_entry->second->object.store_fd; // Tell the store that the client no longer needs the object. @@ -821,22 +842,27 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { // The next Write must pass a higher version. object_entry->second->next_version_to_write++; object_entry->second->is_sealed = true; - //// Send the seal request to Plasma. 
- // RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); - // std::vector buffer; - // RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); - // ObjectID sealed_id; - // RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); - // RAY_CHECK(sealed_id == object_id); - //// We call PlasmaClient::Release to decrement the number of instances of this - //// object - //// that are currently being used by this client. The corresponding increment - //// happened in plasma_create and was used to ensure that the object was not - //// released before the call to PlasmaClient::Seal. - // return Release(object_id); - - // TODO(swang): Release the object if the ref count == 0. - return Status::OK(); + + if (RayConfig::instance().plasma_use_shared_memory_seal()) { + // If using shared-memory based Seal, then we don't need to do anything + // further because the object store will learn that the object has been + // sealed when ReadAcquire returns. + return Status::OK(); + } + + /// Send the seal request to Plasma. + RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); + std::vector buffer; + RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); + ObjectID sealed_id; + RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); + RAY_CHECK(sealed_id == object_id); + // We call PlasmaClient::Release to decrement the number of instances of this + // object + // that are currently being used by this client. The corresponding increment + // happened in plasma_create and was used to ensure that the object was not + // released before the call to PlasmaClient::Seal. + return Release(object_id); } Status PlasmaClient::Impl::Abort(const ObjectID &object_id) { diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index c0aa1319b193f..c6b80160511c8 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -528,7 +528,9 @@ void PlasmaStore::ReplyToCreateClient(const std::shared_ptr &client, static_cast(client->SendFd(result.store_fd)); } - WaitForSeal(object_id, client); + if (RayConfig::instance().plasma_use_shared_memory_seal()) { + WaitForSeal(object_id, client); + } } else { static_cast(SendUnfinishedCreateReply(client, object_id, req_id)); } @@ -540,6 +542,9 @@ void PlasmaStore::WaitForSeal(const ObjectID &object_id, RAY_CHECK(entry); auto plasma_header = entry->GetPlasmaObjectHeader(); + // Read acquire is blocking, so put it on a background thread and use an + // async timer as a signal. The main thread is signaled when the timer is + // cancelled. 
auto seal_signal = std::make_shared(io_context_); seal_signal->expires_at(boost::posix_time::pos_infin); From 6d4aa943ebd8b77429a4e54857e4012ef40ab2c3 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 29 Nov 2023 17:35:00 -0800 Subject: [PATCH 08/66] put-get Signed-off-by: Stephanie Wang --- python/ray/__init__.py | 2 +- python/ray/_private/worker.py | 5 ++++- python/ray/tests/test_accelerated_dag.py | 16 ++++++++++++---- src/ray/object_manager/plasma/client.cc | 1 + 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/python/ray/__init__.py b/python/ray/__init__.py index d95d2f5a20c5b..fd32de31e5463 100644 --- a/python/ray/__init__.py +++ b/python/ray/__init__.py @@ -116,11 +116,11 @@ def _configure_system(): SPILL_WORKER_MODE, _create_mutable_object, _put_mutable_object, + _release_mutable_object, cancel, get, get_actor, get_gpu_ids, - release, init, is_initialized, put, diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index 4e5b08f6dedb1..31c8a9fae2276 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -2498,7 +2498,7 @@ def show_in_dashboard(message: str, key: str = "", dtype: str = "text"): blocking_get_inside_async_warned = False -def release(object_ref): +def _release_mutable_object(object_ref): worker = global_worker worker.check_connected() worker.core_worker.get_release([object_ref]) @@ -2640,6 +2640,9 @@ def _put_mutable_object(value: Any, object_ref: ObjectRef, num_readers: int): worker = global_worker worker.check_connected() + if num_readers <= 0: + raise ValueError("``num_readers`` must be a positive integer.") + try: serialized_value = worker.get_serialization_context().serialize(value) except TypeError as e: diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index 405d78d46cc9b..4622378349acc 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -11,11 +11,19 @@ logger = logging.getLogger(__name__) -def test_put_mutable_object(ray_start_cluster): - ray.init() +def test_put_get(ray_start_cluster): + ray.init( + _system_config={ + "plasma_use_shared_memory_seal": True, + } + ) ref = ray._create_mutable_object(1000) - ray._put_mutable_object(b"hello", ref, num_readers=1) - assert ray.get(ref) == b"hello" + + for i in range(100): + val = i.to_bytes(8, "little") + ray._put_mutable_object(val, ref, num_readers=1) + assert ray.get(ref) == val + ray._release_mutable_object(ref) if __name__ == "__main__": diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 95f5f8489c5c4..f8148f01eaf84 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -735,6 +735,7 @@ Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { plasma_header->ReadRelease(entry->next_version_to_read); // The next read needs to read at least this version. 
entry->next_version_to_read++; + entry->read_acquired = false; return Status::OK(); } From bc4f1e9b98b7a8d757725836dfa4b3bba81d5562 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 29 Nov 2023 17:44:55 -0800 Subject: [PATCH 09/66] rm shared mem seal Signed-off-by: Stephanie Wang --- python/ray/tests/test_accelerated_dag.py | 7 +--- src/ray/common/ray_config_def.h | 4 -- src/ray/object_manager/common.cc | 1 - src/ray/object_manager/plasma/client.cc | 12 +----- src/ray/object_manager/plasma/object_store.cc | 5 +-- src/ray/object_manager/plasma/store.cc | 41 ------------------- src/ray/object_manager/plasma/store.h | 4 -- 7 files changed, 5 insertions(+), 69 deletions(-) diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index 4622378349acc..ebc915255c7e7 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -12,11 +12,8 @@ def test_put_get(ray_start_cluster): - ray.init( - _system_config={ - "plasma_use_shared_memory_seal": True, - } - ) + ray.init() + ref = ray._create_mutable_object(1000) for i in range(100): diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index ba6825b066954..d15d9fbffc7dd 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -886,7 +886,3 @@ RAY_CONFIG(bool, enable_autoscaler_v2, false) // Python GCS client number of reconnection retry and timeout. RAY_CONFIG(int64_t, nums_py_gcs_reconnect_retry, 5) RAY_CONFIG(int64_t, py_gcs_connect_timeout_s, 30) - -// Feature flag for whether to use shared-memory based synchronization to -// implement plasma object Seal. The current method is to instead use IPC. -RAY_CONFIG(bool, plasma_use_shared_memory_seal, false) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index 24e906d9ba4d5..34194f249d234 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -78,7 +78,6 @@ void PlasmaObjectHeader::WriteRelease(int64_t write_version) { << version << ". Are you sure this is the only writer?"; version = write_version; - RAY_CHECK(num_readers != 0); num_read_acquires_remaining = num_readers; num_read_releases_remaining = num_readers; diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index f8148f01eaf84..46f6d4ac286f2 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -501,9 +501,8 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, plasma_header->WriteAcquire(/*next_version_to_write*/ entry->next_version_to_write, entry->object.data_size); if (entry->is_mutable) { - // The plasma store is the first reader. Once it read-releases, the - // writer may write an actual value. - plasma_header->num_readers = 1; + // When the object is first created, it is in writeable state. + plasma_header->num_readers = 0; } else { // Anyone may read. plasma_header->num_readers = -1; @@ -844,13 +843,6 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { object_entry->second->next_version_to_write++; object_entry->second->is_sealed = true; - if (RayConfig::instance().plasma_use_shared_memory_seal()) { - // If using shared-memory based Seal, then we don't need to do anything - // further because the object store will learn that the object has been - // sealed when ReadAcquire returns. - return Status::OK(); - } - /// Send the seal request to Plasma. 
RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); std::vector buffer; diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index 7d60c3ff1394a..8f4178dc9b797 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -71,10 +71,7 @@ const LocalObject *ObjectStore::SealObject(const ObjectID &object_id) { entry->state = ObjectState::PLASMA_SEALED; entry->construct_duration = std::time(nullptr) - entry->create_time; auto plasma_header = entry->GetPlasmaObjectHeader(); - if (entry->object_info.is_mutable) { - // Register the sealed object before allowing the writer to write. - plasma_header->ReadRelease(/*read_version=*/1); - } else { + if (!entry->object_info.is_mutable) { RAY_CHECK(plasma_header->num_readers == -1) << plasma_header->num_readers; } return entry; diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index c6b80160511c8..2f96cf139d5b9 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -527,52 +527,11 @@ void PlasmaStore::ReplyToCreateClient(const std::shared_ptr &client, error == PlasmaError::OK && result.device_num == 0) { static_cast(client->SendFd(result.store_fd)); } - - if (RayConfig::instance().plasma_use_shared_memory_seal()) { - WaitForSeal(object_id, client); - } } else { static_cast(SendUnfinishedCreateReply(client, object_id, req_id)); } } -void PlasmaStore::WaitForSeal(const ObjectID &object_id, - const std::shared_ptr &client) { - auto entry = object_lifecycle_mgr_.GetObject(object_id); - RAY_CHECK(entry); - auto plasma_header = entry->GetPlasmaObjectHeader(); - - // Read acquire is blocking, so put it on a background thread and use an - // async timer as a signal. The main thread is signaled when the timer is - // cancelled. - auto seal_signal = std::make_shared(io_context_); - seal_signal->expires_at(boost::posix_time::pos_infin); - - auto wait_fn = [this, seal_signal, plasma_header]() { - plasma_header->ReadAcquire(/*read_version=*/1); - - { - absl::MutexLock lock(&seal_deadline_timer_mutex_); - seal_signal->cancel(); - } - }; - - auto wait_thread = std::make_shared(wait_fn); - - { - absl::MutexLock lock(&seal_deadline_timer_mutex_); - seal_signal->async_wait([this, object_id, plasma_header, wait_thread, client]( - const boost::system::error_code &ec) { - { - absl::MutexLock lock(&mutex_); - SealObjects({object_id}); - } - - wait_thread->join(); - }); - } -} - int64_t PlasmaStore::GetConsumedBytes() { return total_consumed_bytes_; } bool PlasmaStore::IsObjectSpillable(const ObjectID &object_id) { diff --git a/src/ray/object_manager/plasma/store.h b/src/ray/object_manager/plasma/store.h index 3a4b457a6381b..a6c992c131280 100644 --- a/src/ray/object_manager/plasma/store.h +++ b/src/ray/object_manager/plasma/store.h @@ -118,8 +118,6 @@ class PlasmaStore { return available; } - void WaitForSeal(const ObjectID &object_id, const std::shared_ptr &client); - private: /// Create a new object. The client must do a call to release_object to tell /// the store when it is done with the object. 
@@ -309,8 +307,6 @@ class PlasmaStore { bool dumped_on_oom_ ABSL_GUARDED_BY(mutex_) = false; GetRequestQueue get_request_queue_ ABSL_GUARDED_BY(mutex_); - - absl::Mutex seal_deadline_timer_mutex_; }; } // namespace plasma From c4a2378baf57535a988f4b389155b5266d9f3fba Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 30 Nov 2023 08:47:15 -0800 Subject: [PATCH 10/66] fix num_readers on first version, unit tests pass now Signed-off-by: Stephanie Wang --- python/ray/_private/worker.py | 6 ++- python/ray/tests/test_accelerated_dag.py | 32 +++++++++++-- src/ray/object_manager/common.cc | 1 + src/ray/object_manager/plasma/client.cc | 59 ++++++++++++------------ src/ray/object_manager/plasma/store.cc | 2 - 5 files changed, 63 insertions(+), 37 deletions(-) diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index 31c8a9fae2276..d9ed7a72e94c5 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -2498,10 +2498,12 @@ def show_in_dashboard(message: str, key: str = "", dtype: str = "text"): blocking_get_inside_async_warned = False -def _release_mutable_object(object_ref): +def _release_mutable_object(object_refs): worker = global_worker worker.check_connected() - worker.core_worker.get_release([object_ref]) + if isinstance(object_refs, ObjectRef): + object_refs = [object_refs] + worker.core_worker.get_release(object_refs) @overload diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index ebc915255c7e7..4adc04bb1e51b 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -11,18 +11,42 @@ logger = logging.getLogger(__name__) -def test_put_get(ray_start_cluster): - ray.init() - +def test_put_local_get(ray_start_regular): ref = ray._create_mutable_object(1000) - for i in range(100): + num_writes = 1000 + for i in range(num_writes): val = i.to_bytes(8, "little") ray._put_mutable_object(val, ref, num_readers=1) assert ray.get(ref) == val ray._release_mutable_object(ref) +@pytest.mark.parametrize("num_readers", [1, 4]) +def test_put_remote_get(ray_start_regular, num_readers): + ref = ray._create_mutable_object(1000) + + @ray.remote(num_cpus=0) + class Reader: + def __init__(self): + pass + + def read(self, ref, num_writes): + for i in range(num_writes): + val = i.to_bytes(8, "little") + assert ray.get(ref[0]) == val + ray._release_mutable_object(ref) + + num_writes = 1000 + readers = [Reader.remote() for _ in range(num_readers)] + done = [reader.read.remote([ref], num_writes) for reader in readers] + for i in range(num_writes): + val = i.to_bytes(8, "little") + ray._put_mutable_object(val, ref, num_readers=num_readers) + + ray.get(done) + + if __name__ == "__main__": if os.environ.get("PARALLEL_CI"): sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index 34194f249d234..bd1cc168481c8 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -78,6 +78,7 @@ void PlasmaObjectHeader::WriteRelease(int64_t write_version) { << version << ". 
Are you sure this is the only writer?"; version = write_version; + RAY_CHECK(num_readers != 0) << num_readers; num_read_acquires_remaining = num_readers; num_read_releases_remaining = num_readers; diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 46f6d4ac286f2..b25d9d28853b5 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -493,17 +493,13 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, entry->is_mutable = is_mutable; auto plasma_header = GetPlasmaObjectHeader(entry->object); - // The corresponding WriteRelease takes place in Seal. - // When an object is first created, the data size is equivalent to - // buffer size. - // The first creation's version is always 1. - RAY_CHECK(entry->next_version_to_write == 1); - plasma_header->WriteAcquire(/*next_version_to_write*/ entry->next_version_to_write, - entry->object.data_size); - if (entry->is_mutable) { - // When the object is first created, it is in writeable state. - plasma_header->num_readers = 0; - } else { + if (!entry->is_mutable) { + // The first creation's version is always 1. + RAY_CHECK(entry->next_version_to_write == 1); + // The corresponding WriteRelease takes place in Seal. + // When an object is first created, the data size is equivalent to + // buffer size. + plasma_header->WriteAcquire(entry->next_version_to_write, data_size); // Anyone may read. plasma_header->num_readers = -1; } @@ -836,26 +832,31 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { return Status::ObjectAlreadySealed("Seal() called on an already sealed object"); } - auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); - plasma_header->WriteRelease( - /*write_version=*/object_entry->second->next_version_to_write); - // The next Write must pass a higher version. - object_entry->second->next_version_to_write++; object_entry->second->is_sealed = true; + auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); + if (plasma_header->num_readers != 0) { + plasma_header->WriteRelease( + /*write_version=*/object_entry->second->next_version_to_write); + // The next Write must pass a higher version. + object_entry->second->next_version_to_write++; + } else { + // Send the seal request to Plasma. This is the normal Seal path, used for + // immutable objects and the initial Create call for mutable objects. + RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); + std::vector buffer; + RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); + ObjectID sealed_id; + RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); + RAY_CHECK(sealed_id == object_id); + // We call PlasmaClient::Release to decrement the number of instances of this + // object + // that are currently being used by this client. The corresponding increment + // happened in plasma_create and was used to ensure that the object was not + // released before the call to PlasmaClient::Seal. + RAY_RETURN_NOT_OK(Release(object_id)); + } - /// Send the seal request to Plasma. 
- RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); - std::vector buffer; - RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); - ObjectID sealed_id; - RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); - RAY_CHECK(sealed_id == object_id); - // We call PlasmaClient::Release to decrement the number of instances of this - // object - // that are currently being used by this client. The corresponding increment - // happened in plasma_create and was used to ensure that the object was not - // released before the call to PlasmaClient::Seal. - return Release(object_id); + return Status::OK(); } Status PlasmaClient::Impl::Abort(const ObjectID &object_id) { diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index 2f96cf139d5b9..66876de42cbcc 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -31,9 +31,7 @@ #include #include #include -#include -#include #include #include #include From e40d3c837b790c634baa4f416a6b42c228d3eb06 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 30 Nov 2023 13:26:48 -0800 Subject: [PATCH 11/66] mutable object -> channel Signed-off-by: Stephanie Wang --- python/ray/__init__.py | 6 +++--- python/ray/_private/worker.py | 12 +++++++++--- python/ray/tests/test_accelerated_dag.py | 12 ++++++------ 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/python/ray/__init__.py b/python/ray/__init__.py index fd32de31e5463..95ac20ceb15fb 100644 --- a/python/ray/__init__.py +++ b/python/ray/__init__.py @@ -114,9 +114,9 @@ def _configure_system(): WORKER_MODE, RESTORE_WORKER_MODE, SPILL_WORKER_MODE, - _create_mutable_object, - _put_mutable_object, - _release_mutable_object, + _create_channel, + _write_channel, + _end_read_channel, cancel, get, get_actor, diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index d9ed7a72e94c5..fa93f8ea91851 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -2498,7 +2498,13 @@ def show_in_dashboard(message: str, key: str = "", dtype: str = "text"): blocking_get_inside_async_warned = False -def _release_mutable_object(object_refs): +def _end_read_channel(object_refs): + """ + Signal to the writer that the channel is ready to write again. The read + begins when the caller calls ray.get and a written value is available. If + ray.get is not called first, then this call will block until a value is + written, then drop the value. 
+ """ worker = global_worker worker.check_connected() if isinstance(object_refs, ObjectRef): @@ -2638,7 +2644,7 @@ def get( @PublicAPI -def _put_mutable_object(value: Any, object_ref: ObjectRef, num_readers: int): +def _write_channel(value: Any, object_ref: ObjectRef, num_readers: int): worker = global_worker worker.check_connected() @@ -2663,7 +2669,7 @@ def _put_mutable_object(value: Any, object_ref: ObjectRef, num_readers: int): @PublicAPI -def _create_mutable_object( +def _create_channel( buffer_size: int, ) -> "ray.ObjectRef": worker = global_worker diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index 4adc04bb1e51b..c836bfd856450 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -12,19 +12,19 @@ def test_put_local_get(ray_start_regular): - ref = ray._create_mutable_object(1000) + ref = ray._create_channel(1000) num_writes = 1000 for i in range(num_writes): val = i.to_bytes(8, "little") - ray._put_mutable_object(val, ref, num_readers=1) + ray._write_channel(val, ref, num_readers=1) assert ray.get(ref) == val - ray._release_mutable_object(ref) + ray._end_read_channel(ref) @pytest.mark.parametrize("num_readers", [1, 4]) def test_put_remote_get(ray_start_regular, num_readers): - ref = ray._create_mutable_object(1000) + ref = ray._create_channel(1000) @ray.remote(num_cpus=0) class Reader: @@ -35,14 +35,14 @@ def read(self, ref, num_writes): for i in range(num_writes): val = i.to_bytes(8, "little") assert ray.get(ref[0]) == val - ray._release_mutable_object(ref) + ray._end_read_channel(ref) num_writes = 1000 readers = [Reader.remote() for _ in range(num_readers)] done = [reader.read.remote([ref], num_writes) for reader in readers] for i in range(num_writes): val = i.to_bytes(8, "little") - ray._put_mutable_object(val, ref, num_readers=num_readers) + ray._write_channel(val, ref, num_readers=num_readers) ray.get(done) From b79b7d1ea9c77166e63022a6abe033635b5b8abb Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 30 Nov 2023 15:25:11 -0800 Subject: [PATCH 12/66] micro Signed-off-by: Stephanie Wang --- python/ray/_private/ray_perf.py | 83 ++++++++++++++++++++++++++++++++ src/ray/object_manager/common.cc | 2 +- 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py index 316f3baeca846..1d07c49aea89f 100644 --- a/python/ray/_private/ray_perf.py +++ b/python/ray/_private/ray_perf.py @@ -288,6 +288,89 @@ def async_actor_multi(): results += timeit("n:n async-actor calls async", async_actor_multi, m * n) ray.shutdown() + ################################################# + # Perf tests for channels, used in compiled DAGs. 
+ ################################################# + + ray.init() + + def put_channel_small(chans, num_readers=1, do_get=False, do_release=False): + for chan in chans: + ray._write_channel(b"0", chan, num_readers=num_readers) + if do_get: + ray.get(chan) + if do_release: + ray._end_read_channel(chan) + + @ray.remote + class ChannelReader: + def ready(self): + return + + def read(self, chans): + while True: + for chan in chans: + ray.get(chan) + ray._end_read_channel(chan) + + chans = [ray._create_channel(1000)] + results += timeit( + "local put, single channel calls", + lambda: put_channel_small(chans, do_release=True), + ) + results += timeit( + "local put:local get, single channel calls", + lambda: put_channel_small(chans, do_get=True, do_release=True), + ) + + chans = [ray._create_channel(1000)] + reader = ChannelReader.remote() + ray.get(reader.ready.remote()) + reader.read.remote(chans) + results += timeit( + "local put:1 remote get, single channel calls", lambda: put_channel_small(chans) + ) + ray.kill(reader) + + n_cpu = multiprocessing.cpu_count() // 2 + print(f"Testing multiple readers/channels, n={n_cpu}") + + chans = [ray._create_channel(1000)] + readers = [ChannelReader.remote() for _ in range(n_cpu)] + ray.get([reader.ready.remote() for reader in readers]) + for reader in readers: + reader.read.remote(chans) + results += timeit( + "local put:n remote get, single channel calls", + lambda: put_channel_small(chans, num_readers=n_cpu), + ) + for reader in readers: + ray.kill(reader) + + chans = [ray._create_channel(1000) for _ in range(n_cpu)] + reader = ChannelReader.remote() + ray.get(reader.ready.remote()) + reader.read.remote(chans) + results += timeit( + "local put:1 remote get, n channels calls", lambda: put_channel_small(chans) + ) + ray.kill(reader) + + chans = [ray._create_channel(1000) for _ in range(n_cpu)] + readers = [ChannelReader.remote() for _ in range(n_cpu)] + ray.get([reader.ready.remote() for reader in readers]) + for chan, reader in zip(chans, readers): + reader.read.remote([chan]) + results += timeit( + "local put:n remote get, n channels calls", lambda: put_channel_small(chans) + ) + for reader in readers: + ray.kill(reader) + + ############################ + # End of channel perf tests. + ############################ + NUM_PGS = 100 NUM_BUNDLES = 1 ray.init(resources={"custom": 100}) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index bd1cc168481c8..970fb9d096370 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -40,7 +40,7 @@ void PlasmaObjectHeader::Destroy() { // This has to be called only when reader lock is acquired // via ReadAcquire. 
uint64_t PlasmaObjectHeader::GetDataSize() const { - RAY_CHECK_GE(num_read_releases_remaining, 0) + RAY_CHECK_NE(num_read_releases_remaining, 0) << "ReadAcquire has to be called before calling this method."; return data_size; } From 5ea0fe3fd887303dfd831ec8d46e29e1d0ad671c Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 30 Nov 2023 17:07:50 -0800 Subject: [PATCH 13/66] support different metadata Signed-off-by: Stephanie Wang --- python/ray/tests/test_accelerated_dag.py | 18 ++++++ src/ray/object_manager/common.cc | 17 ++++-- src/ray/object_manager/common.h | 9 ++- src/ray/object_manager/plasma/client.cc | 68 +++++++++++++---------- src/ray/object_manager/plasma/common.h | 4 ++ src/ray/object_manager/plasma/plasma.fbs | 4 ++ src/ray/object_manager/plasma/plasma.h | 3 + src/ray/object_manager/plasma/protocol.cc | 4 ++ 8 files changed, 92 insertions(+), 35 deletions(-) diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index c836bfd856450..e8ea4e7eb4a13 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -3,6 +3,7 @@ import os import sys +import numpy as np import pytest import ray @@ -22,6 +23,23 @@ def test_put_local_get(ray_start_regular): ray._end_read_channel(ref) +def test_put_different_meta(ray_start_regular): + ref = ray._create_channel(1000) + + def _test(val): + ray._write_channel(val, ref, num_readers=1) + if isinstance(val, np.ndarray): + assert np.array_equal(ray.get(ref), val) + else: + assert ray.get(ref) == val + ray._end_read_channel(ref) + + _test(b"hello") + _test("hello") + _test(1000) + _test(np.random.rand(10)) + + @pytest.mark.parametrize("num_readers", [1, 4]) def test_put_remote_get(ray_start_regular, num_readers): ref = ray._create_channel(1000) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index 970fb9d096370..7c95845012896 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -10,7 +10,8 @@ void PrintPlasmaObjectHeader(const PlasmaObjectHeader *header) { << "\n" << "num_read_releases_remaining: " << header->num_read_releases_remaining << "\n" - << "data_size: " << header->data_size << "\n"; + << "data_size: " << header->data_size << "\n" + << "metadata_size: " << header->metadata_size << "\n"; } void PlasmaObjectHeader::Init() { @@ -45,8 +46,13 @@ uint64_t PlasmaObjectHeader::GetDataSize() const { return data_size; } -void PlasmaObjectHeader::WriteAcquire(int64_t write_version, uint64_t new_size) { - RAY_LOG(DEBUG) << "WriteAcquire. version: " << write_version; +void PlasmaObjectHeader::WriteAcquire(int64_t write_version, + uint64_t write_data_size, + uint64_t write_metadata_size, + int64_t write_num_readers) { + RAY_LOG(DEBUG) << "WriteAcquire. version: " << write_version << ", data size " + << write_data_size << ", metadata size " << write_metadata_size + << ", num readers: " << write_num_readers; sem_wait(&rw_semaphore); RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); PrintPlasmaObjectHeader(this); @@ -58,9 +64,10 @@ void PlasmaObjectHeader::WriteAcquire(int64_t write_version, uint64_t new_size) << " is more than 1 greater than current version " << version << ". 
Are you sure this is the only writer?"; - num_readers = 0; version = write_version; - data_size = new_size; + data_size = write_data_size; + metadata_size = write_metadata_size; + num_readers = write_num_readers; RAY_LOG(DEBUG) << "WriteAcquire done"; PrintPlasmaObjectHeader(this); diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 395c86ee8223b..31083942a928d 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -80,8 +80,13 @@ struct PlasmaObjectHeader { // Blocks until there are no more readers. // NOTE: Caller should ensure there is one writer at a time. /// \param write_version The new version for write. - /// \param new_size The new data size of the object. - void WriteAcquire(int64_t write_version, uint64_t new_data_size); + /// \param data_size The new data size of the object. + /// \param metadata_size The new metadata size of the object. + /// \param num_readers The number of readers for the object. + void WriteAcquire(int64_t write_version, + uint64_t data_size, + uint64_t metadata_size, + int64_t num_readers); // Call after completing a write to signal that readers may read. // num_readers should be set before calling this. diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index b25d9d28853b5..8a5637957a3ef 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -168,8 +168,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this &object_entry); + void EnsureGetAcquired(std::unique_ptr &object_entry); Status GetRelease(const ObjectID &object_id); @@ -417,18 +416,14 @@ Status PlasmaClient::Impl::WriteAcquireMutableObject(const ObjectID &object_id, // Wait for no readers. auto plasma_header = GetPlasmaObjectHeader(entry->object); - // NOTE: entry->object.data_size is the size of the data buffer. - // When the object is shared, we can have object size smaller than the data buffer. // TODO(swang): Better exception. - // TODO(swang): Support data size larger than allocated buffer. - RAY_CHECK(data_size <= entry->object.data_size) - << "Cannot write mutable data size " << data_size - << " larger than allocated buffer size " << entry->object.data_size; - // TODO(swang): Support different metadata size. - RAY_CHECK(metadata_size == entry->object.metadata_size) - << "Metadata size must stay the same"; - plasma_header->WriteAcquire(entry->next_version_to_write, data_size); - plasma_header->num_readers = num_readers; + // TODO(swang): Support data + metadata size larger than allocated buffer. + RAY_CHECK(data_size + metadata_size <= entry->object.allocated_size) + << "Cannot write mutable data size " << data_size << " + metadata size " + << metadata_size << " larger than allocated buffer size " + << entry->object.allocated_size; + plasma_header->WriteAcquire( + entry->next_version_to_write, data_size, metadata_size, num_readers); // Prepare the data buffer and return to the client instead of sending // the IPC to object store. @@ -439,8 +434,7 @@ Status PlasmaClient::Impl::WriteAcquireMutableObject(const ObjectID &object_id, data_size); if (metadata != NULL) { // Copy the metadata to the buffer. 
- memcpy( - (*data)->Data() + entry->object.data_size, metadata, entry->object.metadata_size); + memcpy((*data)->Data() + data_size, metadata, metadata_size); } entry->is_sealed = false; @@ -499,9 +493,11 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, // The corresponding WriteRelease takes place in Seal. // When an object is first created, the data size is equivalent to // buffer size. - plasma_header->WriteAcquire(entry->next_version_to_write, data_size); - // Anyone may read. - plasma_header->num_readers = -1; + plasma_header->WriteAcquire(entry->next_version_to_write, + data_size, + metadata_size, + // Anyone may read an immutable object. + /*num_readers=*/-1); } } @@ -569,11 +565,14 @@ Status PlasmaClient::Impl::GetBuffers( all_present = false; } else { // Wait for the object to become ready to read. - auto plasma_header = EnsureGetAcquired(object_entry->second); + EnsureGetAcquired(object_entry->second); PlasmaObject *object = &object_entry->second->object; std::shared_ptr physical_buf; + RAY_LOG(DEBUG) << "Plasma Get " << object_ids[i] + << ", data size: " << object->data_size + << ", metadata size: " << object->metadata_size; if (object->device_num == 0) { uint8_t *data = LookupMmappedFile(object->store_fd); physical_buf = std::make_shared( @@ -582,8 +581,8 @@ Status PlasmaClient::Impl::GetBuffers( RAY_LOG(FATAL) << "GPU library is not enabled."; } physical_buf = wrap_buffer(object_ids[i], physical_buf); - auto data_size = plasma_header->GetDataSize(); - object_buffers[i].data = SharedMemoryBuffer::Slice(physical_buf, 0, data_size); + object_buffers[i].data = + SharedMemoryBuffer::Slice(physical_buf, 0, object->data_size); object_buffers[i].metadata = SharedMemoryBuffer::Slice( physical_buf, object->data_size, object->metadata_size); object_buffers[i].device_num = object->device_num; @@ -647,9 +646,11 @@ Status PlasmaClient::Impl::GetBuffers( // Wait for the object to become ready to read. RAY_CHECK(!object_entry->read_acquired); - auto plasma_header = EnsureGetAcquired(object_entry); - auto data_size = plasma_header->GetDataSize(); + EnsureGetAcquired(object_entry); std::shared_ptr physical_buf; + RAY_LOG(DEBUG) << "Plasma Get " << received_object_ids[i] + << ", data size: " << object->data_size + << ", metadata size: " << object->metadata_size; if (object->device_num == 0) { uint8_t *data = LookupMmappedFile(object->store_fd); physical_buf = std::make_shared( @@ -659,7 +660,8 @@ Status PlasmaClient::Impl::GetBuffers( } // Finish filling out the return values. 
physical_buf = wrap_buffer(object_ids[i], physical_buf); - object_buffers[i].data = SharedMemoryBuffer::Slice(physical_buf, 0, data_size); + object_buffers[i].data = + SharedMemoryBuffer::Slice(physical_buf, 0, object->data_size); object_buffers[i].metadata = SharedMemoryBuffer::Slice( physical_buf, object->data_size, object->metadata_size); object_buffers[i].device_num = object->device_num; @@ -689,12 +691,12 @@ Status PlasmaClient::Impl::Get(const std::vector &object_ids, &object_ids[0], num_objects, timeout_ms, wrap_buffer, &(*out)[0], is_from_worker); } -ray::PlasmaObjectHeader *PlasmaClient::Impl::EnsureGetAcquired( +void PlasmaClient::Impl::EnsureGetAcquired( std::unique_ptr &object_entry) { PlasmaObject *object = &object_entry->object; auto plasma_header = GetPlasmaObjectHeader(*object); if (object_entry->read_acquired) { - return plasma_header; + return; } int64_t version_read = plasma_header->ReadAcquire(object_entry->next_version_to_read); @@ -702,8 +704,17 @@ ray::PlasmaObjectHeader *PlasmaClient::Impl::EnsureGetAcquired( if (version_read > 0) { object_entry->is_mutable = true; object_entry->next_version_to_read = version_read; + + // The data and metadata size may have changed, so update here before we + // create the Get buffer to return. + object_entry->object.data_size = plasma_header->data_size; + object_entry->object.metadata_size = plasma_header->metadata_size; + object_entry->object.metadata_offset = + object_entry->object.data_offset + object_entry->object.data_size; + RAY_CHECK(object_entry->object.data_size + object_entry->object.metadata_size <= + object_entry->object.allocated_size); } - return plasma_header; + return; } Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { @@ -725,8 +736,9 @@ Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { "ray.release() called on an object that is not mutable"); } - auto plasma_header = EnsureGetAcquired(entry); + EnsureGetAcquired(entry); RAY_LOG(DEBUG) << "Release shared object " << object_id; + auto plasma_header = GetPlasmaObjectHeader(entry->object); plasma_header->ReadRelease(entry->next_version_to_read); // The next read needs to read at least this version. entry->next_version_to_read++; diff --git a/src/ray/object_manager/plasma/common.h b/src/ray/object_manager/plasma/common.h index d74eb88cec8b8..d24a110c32e44 100644 --- a/src/ray/object_manager/plasma/common.h +++ b/src/ray/object_manager/plasma/common.h @@ -140,6 +140,10 @@ class LocalObject { GetObjectInfo().data_size; object->data_size = GetObjectInfo().data_size; object->metadata_size = GetObjectInfo().metadata_size; + // Senders and receivers of a channel may store different data and metadata + // sizes locally depending on what data is written to the channel, but the + // plasma store keeps the original data and metadata size. + object->allocated_size = object->data_size + object->metadata_size; object->device_num = GetAllocation().device_num; object->mmap_size = GetAllocation().mmap_size; } diff --git a/src/ray/object_manager/plasma/plasma.fbs b/src/ray/object_manager/plasma/plasma.fbs index ba2df089c6032..0c4f8ac66f48b 100644 --- a/src/ray/object_manager/plasma/plasma.fbs +++ b/src/ray/object_manager/plasma/plasma.fbs @@ -106,6 +106,10 @@ struct PlasmaObjectSpec { metadata_offset: ulong; // The size in bytes of the metadata. metadata_size: ulong; + // The allocated size. This is just data_size + metadata_size + // for immutable objects, but for mutable objects, the data size + // and metadata size may change. 
+ allocated_size: ulong; // Device to create buffer on. device_num: int; } diff --git a/src/ray/object_manager/plasma/plasma.h b/src/ray/object_manager/plasma/plasma.h index 775226c922665..bb21f394d5b0c 100644 --- a/src/ray/object_manager/plasma/plasma.h +++ b/src/ray/object_manager/plasma/plasma.h @@ -48,6 +48,9 @@ struct PlasmaObject { int64_t data_size; /// The size in bytes of the metadata. int64_t metadata_size; + /// The size in bytes that was allocated. data_size + metadata_size must fit + /// within this. + int64_t allocated_size; /// Device number object is on. int device_num; /// Set if device_num is equal to 0. diff --git a/src/ray/object_manager/plasma/protocol.cc b/src/ray/object_manager/plasma/protocol.cc index 50f1f60d332ec..c041486bdefec 100644 --- a/src/ray/object_manager/plasma/protocol.cc +++ b/src/ray/object_manager/plasma/protocol.cc @@ -268,6 +268,7 @@ Status SendCreateReply(const std::shared_ptr &client, object.data_size, object.metadata_offset, object.metadata_size, + object.allocated_size, object.device_num); auto object_string = fbb.CreateString(object_id.Binary()); fb::PlasmaCreateReplyBuilder crb(fbb); @@ -309,6 +310,7 @@ Status ReadCreateReply(uint8_t *data, object->data_size = message->plasma_object()->data_size(); object->metadata_offset = message->plasma_object()->metadata_offset(); object->metadata_size = message->plasma_object()->metadata_size(); + object->allocated_size = message->plasma_object()->allocated_size(); store_fd->first = INT2FD(message->store_fd()); store_fd->second = message->unique_fd_id(); @@ -624,6 +626,7 @@ Status SendGetReply(const std::shared_ptr &client, object.data_size, object.metadata_offset, object.metadata_size, + object.allocated_size, object.device_num)); } std::vector store_fds_as_int; @@ -665,6 +668,7 @@ Status ReadGetReply(uint8_t *data, plasma_objects[i].data_size = object->data_size(); plasma_objects[i].metadata_offset = object->metadata_offset(); plasma_objects[i].metadata_size = object->metadata_size(); + plasma_objects[i].allocated_size = object->allocated_size(); plasma_objects[i].device_num = object->device_num(); } RAY_CHECK(message->store_fds()->size() == message->mmap_sizes()->size()); From cbe257fe381507342b76de30736b7ed4581a847f Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 30 Nov 2023 17:53:25 -0800 Subject: [PATCH 14/66] better error message Signed-off-by: Stephanie Wang --- python/ray/tests/test_accelerated_dag.py | 5 +++++ src/ray/object_manager/plasma/client.cc | 12 +++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index e8ea4e7eb4a13..fa7805a5b9de0 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -39,6 +39,11 @@ def _test(val): _test(1000) _test(np.random.rand(10)) + with pytest.raises(ValueError): + _test(np.random.rand(100)) + + _test(np.random.rand(1)) + @pytest.mark.parametrize("num_readers", [1, 4]) def test_put_remote_get(ray_start_regular, num_readers): diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 8a5637957a3ef..d76f028c0d916 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -416,12 +416,14 @@ Status PlasmaClient::Impl::WriteAcquireMutableObject(const ObjectID &object_id, // Wait for no readers. auto plasma_header = GetPlasmaObjectHeader(entry->object); - // TODO(swang): Better exception. 
// TODO(swang): Support data + metadata size larger than allocated buffer. - RAY_CHECK(data_size + metadata_size <= entry->object.allocated_size) - << "Cannot write mutable data size " << data_size << " + metadata size " - << metadata_size << " larger than allocated buffer size " - << entry->object.allocated_size; + if (data_size + metadata_size > entry->object.allocated_size) { + return Status::InvalidArgument("Serialized size of mutable data (" + + std::to_string(data_size) + ") + metadata size (" + + std::to_string(metadata_size) + + ") is larger than allocated buffer size " + + std::to_string(entry->object.allocated_size)); + } plasma_header->WriteAcquire( entry->next_version_to_write, data_size, metadata_size, num_readers); From a68cefd2e974f1e3b56f79e4bcb36a628e25c155 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 1 Dec 2023 10:48:49 -0800 Subject: [PATCH 15/66] cleanup Signed-off-by: Stephanie Wang --- python/ray/__init__.py | 3 - python/ray/_private/ray_perf.py | 22 +-- python/ray/_private/worker.py | 91 +++------- python/ray/_raylet.pxd | 2 +- python/ray/_raylet.pyx | 73 ++++---- python/ray/experimental/channel.py | 142 +++++++++++++++ python/ray/includes/libcoreworker.pxd | 6 +- python/ray/tests/test_accelerated_dag.py | 33 ++-- src/ray/core_worker/core_worker.cc | 47 +++-- src/ray/core_worker/core_worker.h | 39 +++- .../store_provider/plasma_store_provider.cc | 61 ++++--- .../store_provider/plasma_store_provider.h | 36 +++- src/ray/object_manager/common.cc | 9 - src/ray/object_manager/common.h | 33 ++-- src/ray/object_manager/plasma/client.cc | 166 +++++++++--------- src/ray/object_manager/plasma/client.h | 50 ++++-- 16 files changed, 519 insertions(+), 294 deletions(-) create mode 100644 python/ray/experimental/channel.py diff --git a/python/ray/__init__.py b/python/ray/__init__.py index 95ac20ceb15fb..e74749ab6e8fa 100644 --- a/python/ray/__init__.py +++ b/python/ray/__init__.py @@ -114,9 +114,6 @@ def _configure_system(): WORKER_MODE, RESTORE_WORKER_MODE, SPILL_WORKER_MODE, - _create_channel, - _write_channel, - _end_read_channel, cancel, get, get_actor, diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py index 1d07c49aea89f..dcc49d42b2926 100644 --- a/python/ray/_private/ray_perf.py +++ b/python/ray/_private/ray_perf.py @@ -8,6 +8,8 @@ import multiprocessing import ray +import ray.experimental.channel as ray_channel + logger = logging.getLogger(__name__) @@ -296,11 +298,11 @@ def async_actor_multi(): def put_channel_small(chans, num_readers=1, do_get=False, do_release=False): for chan in chans: - ray._write_channel(b"0", chan, num_readers=num_readers) + chan.write(b"0", num_readers=num_readers) if do_get: - ray.get(chan) + chan.begin_read() if do_release: - ray._end_read_channel(chan) + chan.end_read() @ray.remote class ChannelReader: @@ -310,10 +312,10 @@ def ready(self): def read(self, chans): while True: for chan in chans: - ray.get(chan) - ray._end_read_channel(chan) + chan.begin_read() + chan.end_read() - chans = [ray._create_channel(1000)] + chans = [ray_channel.Channel(1000)] results += timeit( "local put, single channel calls", lambda: put_channel_small(chans, do_release=True), @@ -323,7 +325,7 @@ def read(self, chans): lambda: put_channel_small(chans, do_get=True, do_release=True), ) - chans = [ray._create_channel(1000)] + chans = [ray_channel.Channel(1000)] reader = ChannelReader.remote() ray.get(reader.ready.remote()) reader.read.remote(chans) @@ -335,7 +337,7 @@ def read(self, chans): n_cpu = multiprocessing.cpu_count() // 2 
print(f"Testing multiple readers/channels, n={n_cpu}") - chans = [ray._create_channel(1000)] + chans = [ray_channel.Channel(1000)] readers = [ChannelReader.remote() for _ in range(n_cpu)] ray.get([reader.ready.remote() for reader in readers]) for reader in readers: @@ -347,7 +349,7 @@ def read(self, chans): for reader in readers: ray.kill(reader) - chans = [ray._create_channel(1000) for _ in range(n_cpu)] + chans = [ray_channel.Channel(1000) for _ in range(n_cpu)] reader = ChannelReader.remote() ray.get(reader.ready.remote()) reader.read.remote(chans) @@ -356,7 +358,7 @@ def read(self, chans): ) ray.kill(reader) - chans = [ray._create_channel(1000) for _ in range(n_cpu)] + chans = [ray_channel.Channel(1000) for _ in range(n_cpu)] readers = [ChannelReader.remote() for _ in range(n_cpu)] ray.get([reader.ready.remote() for reader in readers]) for chan, reader in zip(chans, readers): diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index fa93f8ea91851..216a651df87f7 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -688,7 +688,13 @@ def set_mode(self, mode): def set_load_code_from_local(self, load_code_from_local): self._load_code_from_local = load_code_from_local - def put_object(self, value, object_ref=None, owner_address=None, is_mutable=False): + def put_object( + self, + value: Any, + object_ref: Optional["ray.ObjectRef"] = None, + owner_address: Optional[str] = None, + _is_experimental_mutable_object: bool = False, + ): """Put value in the local object store with object reference `object_ref`. This assumes that the value for `object_ref` has not yet been placed in @@ -703,6 +709,10 @@ def put_object(self, value, object_ref=None, owner_address=None, is_mutable=Fals object_ref: The object ref of the value to be put. If None, one will be generated. owner_address: The serialized address of object's owner. + _is_experimental_mutable_object: An experimental flag for mutable + objects. If True, then the returned object will not have a + valid value. The object must be written to using the + ray.experimental.channel API before readers can read. Returns: ObjectRef: The object ref the object was put under. @@ -739,7 +749,7 @@ def put_object(self, value, object_ref=None, owner_address=None, is_mutable=Fals # If the object is mutable, then the raylet should never read the # object. Instead, clients will keep the object pinned. - pin_object = not is_mutable + pin_object = not _is_experimental_mutable_object # This *must* be the first place that we construct this python # ObjectRef because an entry with 0 local references is created when @@ -753,7 +763,7 @@ def put_object(self, value, object_ref=None, owner_address=None, is_mutable=Fals object_ref=object_ref, pin_object=pin_object, owner_address=owner_address, - is_mutable=is_mutable, + _is_experimental_mutable_object=_is_experimental_mutable_object, ), # The initial local reference is already acquired internally. skip_adding_local_ref=True, @@ -775,7 +785,12 @@ def deserialize_objects(self, data_metadata_pairs, object_refs): context = self.get_serialization_context() return context.deserialize_objects(data_metadata_pairs, object_refs) - def get_objects(self, object_refs: list, timeout: Optional[float] = None): + def get_objects( + self, + object_refs: list, + timeout: Optional[float] = None, + _is_experimental_mutable_object: bool = False, + ): """Get the values in the object store associated with the IDs. Return the values from the local object store for object_refs. 
This @@ -791,6 +806,10 @@ def get_objects(self, object_refs: list, timeout: Optional[float] = None): list: List of deserialized objects bytes: UUID of the debugger breakpoint we should drop into or b"" if there is no breakpoint. + _is_experimental_mutable_object: An experimental flag for mutable + objects. If True, then wait until there is a value available to + read. The object must also already be local, or else the get + call will hang. """ # Make sure that the values are object refs. for object_ref in object_refs: @@ -802,7 +821,10 @@ def get_objects(self, object_refs: list, timeout: Optional[float] = None): timeout_ms = int(timeout * 1000) if timeout is not None else -1 data_metadata_pairs = self.core_worker.get_objects( - object_refs, self.current_task_id, timeout_ms + object_refs, + self.current_task_id, + timeout_ms, + _is_experimental_mutable_object, ) debugger_breakpoint = b"" for data, metadata in data_metadata_pairs: @@ -2498,20 +2520,6 @@ def show_in_dashboard(message: str, key: str = "", dtype: str = "text"): blocking_get_inside_async_warned = False -def _end_read_channel(object_refs): - """ - Signal to the writer that the channel is ready to write again. The read - begins when the caller calls ray.get and a written value is available. If - ray.get is not called first, then this call will block until a value is - written, then drop the value. - """ - worker = global_worker - worker.check_connected() - if isinstance(object_refs, ObjectRef): - object_refs = [object_refs] - worker.core_worker.get_release(object_refs) - - @overload def get( object_refs: "Sequence[ObjectRef[Any]]", *, timeout: Optional[float] = None @@ -2643,51 +2651,6 @@ def get( return values -@PublicAPI -def _write_channel(value: Any, object_ref: ObjectRef, num_readers: int): - worker = global_worker - worker.check_connected() - - if num_readers <= 0: - raise ValueError("``num_readers`` must be a positive integer.") - - try: - serialized_value = worker.get_serialization_context().serialize(value) - except TypeError as e: - sio = io.StringIO() - ray.util.inspect_serializability(value, print_file=sio) - msg = ( - "Could not serialize the put value " f"{repr(value)}:\n" f"{sio.getvalue()}" - ) - raise TypeError(msg) from e - - worker.core_worker.put_serialized_object_to_mutable_plasma_object( - serialized_value, - object_ref, - num_readers, - ) - - -@PublicAPI -def _create_channel( - buffer_size: int, -) -> "ray.ObjectRef": - worker = global_worker - worker.check_connected() - - value = b"0" * buffer_size - - try: - object_ref = worker.put_object(value, owner_address=None, is_mutable=True) - except ObjectStoreFullError: - logger.info( - "Put failed since the value was either too large or the " - "store was full of pinned objects." 
- ) - raise - return object_ref - - @PublicAPI @client_mode_hook def put( diff --git a/python/ray/_raylet.pxd b/python/ray/_raylet.pxd index f4f54cffacec0..5d47073b74e8a 100644 --- a/python/ray/_raylet.pxd +++ b/python/ray/_raylet.pxd @@ -135,7 +135,7 @@ cdef class CoreWorker: c_bool created_by_worker, owner_address=*, c_bool inline_small_object=*, - c_bool is_mutable=*) + c_bool is_experimental_mutable_object=*) cdef unique_ptr[CAddress] _convert_python_address(self, address=*) cdef store_task_output( self, serialized_object, diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 3aa2422180c07..44694657d9c61 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -3324,25 +3324,19 @@ cdef class CoreWorker: return self.plasma_event_handler def get_objects(self, object_refs, TaskID current_task_id, - int64_t timeout_ms=-1): + int64_t timeout_ms=-1, + c_bool _is_experimental_mutable_object=False): cdef: c_vector[shared_ptr[CRayObject]] results CTaskID c_task_id = current_task_id.native() c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs) with nogil: op_status = CCoreWorkerProcess.GetCoreWorker().Get( - c_object_ids, timeout_ms, &results) + c_object_ids, timeout_ms, _is_experimental_mutable_object, &results) check_status(op_status) return RayObjectsToDataMetadataPairs(results) - def get_release(self, object_refs): - cdef: - c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs) - with nogil: - op_status = CCoreWorkerProcess.GetCoreWorker().GetRelease(c_object_ids) - check_status(op_status) - def get_if_local(self, object_refs): """Get objects from local plasma store directly without a fetch request to raylet.""" @@ -3374,7 +3368,7 @@ cdef class CoreWorker: c_bool created_by_worker, owner_address=None, c_bool inline_small_object=True, - c_bool is_mutable=False, + c_bool is_experimental_mutable_object=False, ): cdef: unique_ptr[CAddress] c_owner_address @@ -3385,7 +3379,8 @@ cdef class CoreWorker: with nogil: check_status(CCoreWorkerProcess.GetCoreWorker() .CreateOwnedAndIncrementLocalRef( - is_mutable, metadata, data_size, contained_ids, + is_experimental_mutable_object, metadata, + data_size, contained_ids, c_object_id, data, created_by_worker, move(c_owner_address), inline_small_object)) @@ -3474,10 +3469,10 @@ cdef class CoreWorker: generator_id=CObjectID.Nil(), owner_address=c_owner_address)) - def put_serialized_object_to_mutable_plasma_object(self, serialized_object, - ObjectRef object_ref, - num_readers, - ): + def experimental_mutable_object_put_serialized(self, serialized_object, + ObjectRef object_ref, + num_readers, + ): cdef: CObjectID c_object_id = object_ref.native() shared_ptr[CBuffer] data @@ -3485,13 +3480,14 @@ cdef class CoreWorker: metadata = string_to_buffer(serialized_object.metadata) data_size = serialized_object.total_bytes - check_status(CCoreWorkerProcess.GetCoreWorker().WriteAcquireMutableObject( - c_object_id, - metadata, - data_size, - num_readers, - &data, - )) + check_status(CCoreWorkerProcess.GetCoreWorker() + .ExperimentalMutableObjectWriteAcquire( + c_object_id, + metadata, + data_size, + num_readers, + &data, + )) if data_size > 0: (serialized_object).write_to( Buffer.make(data)) @@ -3501,13 +3497,30 @@ cdef class CoreWorker: generator_id=CObjectID.Nil(), owner_address=null_owner_address)) - def put_serialized_object_and_increment_local_ref(self, serialized_object, - ObjectRef object_ref=None, - c_bool pin_object=True, - owner_address=None, - c_bool inline_small_object=True, - c_bool is_mutable=False, - ): + 
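A minimal usage sketch (not part of the diff) of how these Cython bindings are driven through the ray.experimental.channel.Channel wrapper added later in this patch; it assumes a single-node ray.init() and mirrors test_put_local_get:

import ray
import ray.experimental.channel as ray_channel

ray.init()

# Allocate a 1000-byte mutable plasma buffer wrapped by a Channel. Only the
# creating process may write to it.
chan = ray_channel.Channel(1000)

for i in range(10):
    val = i.to_bytes(8, "little")
    # write() serializes the value into the existing buffer (via
    # experimental_mutable_object_put_serialized) and blocks until the
    # previous value has been released by its readers.
    chan.write(val, num_readers=1)
    assert chan.begin_read() == val
    chan.end_read()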
def experimental_mutable_object_read_release(self, object_refs): + """ + For experimental.channel.Channel. + + Signal to the writer that the channel is ready to write again. The read + began when the caller calls ray.get and a written value is available. If + ray.get is not called first, then this call will block until a value is + written, then drop the value. + """ + cdef: + c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs) + with nogil: + op_status = (CCoreWorkerProcess.GetCoreWorker() + .ExperimentalMutableObjectReadRelease(c_object_ids)) + check_status(op_status) + + def put_serialized_object_and_increment_local_ref( + self, serialized_object, + ObjectRef object_ref=None, + c_bool pin_object=True, + owner_address=None, + c_bool inline_small_object=True, + c_bool _is_experimental_mutable_object=False, + ): cdef: CObjectID c_object_id shared_ptr[CBuffer] data @@ -3524,7 +3537,7 @@ cdef class CoreWorker: metadata, total_bytes, object_ref, contained_object_ids, &c_object_id, &data, True, owner_address, inline_small_object, - is_mutable) + _is_experimental_mutable_object) logger.debug( f"Serialized object size of {c_object_id.Hex()} is {total_bytes} bytes") diff --git a/python/ray/experimental/channel.py b/python/ray/experimental/channel.py new file mode 100644 index 0000000000000..42f82e4aa3398 --- /dev/null +++ b/python/ray/experimental/channel.py @@ -0,0 +1,142 @@ +import io +import logging +from typing import Any, Optional + +import ray +from ray.util.annotations import PublicAPI + +# Logger for this module. It should be configured at the entry point +# into the program using Ray. Ray provides a default configuration at +# entry/init points. +logger = logging.getLogger(__name__) + + +def _create_channel_ref( + buffer_size: int, +) -> "ray.ObjectRef": + """ + Create a channel that can be read and written by co-located Ray processes. + + The channel has no buffer, so the writer will block until reader(s) have + read the previous value. Only the channel creator may write to the channel. + + Args: + buffer_size: The number of bytes to allocate for the object data and + metadata. Writes to the channel must produce serialized data and + metadata less than or equal to this value. + Returns: + Channel: A wrapper around ray.ObjectRef. + """ + worker = ray._private.worker.global_worker + worker.check_connected() + + value = b"0" * buffer_size + + try: + object_ref = worker.put_object( + value, owner_address=None, _is_experimental_mutable_object=True + ) + except ray.exceptions.ObjectStoreFullError: + logger.info( + "Put failed since the value was either too large or the " + "store was full of pinned objects." + ) + raise + return object_ref + + +@PublicAPI(stability="alpha") +class Channel: + """ + A wrapper type for ray.ObjectRef. Currently supports ray.get but not + ray.wait. + """ + + def __init__(self, buffer_size: Optional[int] = None): + """ + Create a channel that can be read and written by co-located Ray processes. + + Only the caller may write to the channel. The channel has no buffer, + so the writer will block until reader(s) have read the previous value. + + Args: + buffer_size: The number of bytes to allocate for the object data and + metadata. Writes to the channel must produce serialized data and + metadata less than or equal to this value. + Returns: + Channel: A wrapper around ray.ObjectRef. 
+ """ + if buffer_size is None: + self._base_ref = None + else: + self._base_ref = _create_channel_ref(buffer_size) + + self.worker = ray._private.worker.global_worker + self.worker.check_connected() + + @staticmethod + def _from_base_ref(base_ref: "ray.ObjectRef") -> "Channel": + chan = Channel() + chan._base_ref = base_ref + return chan + + def __reduce__(self): + return self._from_base_ref, (self._base_ref,) + + def write(self, value: Any, num_readers: int): + """ + Write a value to the channel. + + Blocks if there are still pending readers for the previous value. The + writer may not write again until the specified number of readers have + called ``end_read_channel``. + + Args: + value: The value to write. + num_readers: The number of readers that must read and release the value + before we can write again. + """ + if num_readers <= 0: + raise ValueError("``num_readers`` must be a positive integer.") + + try: + serialized_value = self.worker.get_serialization_context().serialize(value) + except TypeError as e: + sio = io.StringIO() + ray.util.inspect_serializability(value, print_file=sio) + msg = ( + "Could not serialize the put value " + f"{repr(value)}:\n" + f"{sio.getvalue()}" + ) + raise TypeError(msg) from e + + self.worker.core_worker.experimental_mutable_object_put_serialized( + serialized_value, + self._base_ref, + num_readers, + ) + + def begin_read(self) -> Any: + """ + Read the latest value from the channel. This call will block until a + value is available to read. + + Returns: + Any: The deserialized value. + """ + values, _ = self.worker.get_objects( + [self._base_ref], _is_experimental_mutable_object=True + ) + return values[0] + + def end_read(self): + """ + Signal to the writer that the channel is ready to write again. + + If begin_read is not called first, then this call will block until a + value is written, then drop the value. 
+ """ + self.worker.core_worker.experimental_mutable_object_read_release( + [self._base_ref] + ) diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index bf6a6e810fc9d..00bf9b5f9d4e6 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -240,7 +240,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: const CAddress &owner_address, shared_ptr[CBuffer] *data, c_bool created_by_worker) - CRayStatus WriteAcquireMutableObject( + CRayStatus ExperimentalMutableObjectWriteAcquire( const CObjectID &object_id, const shared_ptr[CBuffer] &metadata, uint64_t data_size, @@ -251,8 +251,10 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: CRayStatus SealExisting(const CObjectID &object_id, c_bool pin_object, const CObjectID &generator_id, const unique_ptr[CAddress] &owner_address) - CRayStatus GetRelease(const c_vector[CObjectID] &object_ids) + CRayStatus ExperimentalMutableObjectReadRelease( + const c_vector[CObjectID] &object_ids) CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms, + c_bool is_experimental_mutable_object, c_vector[shared_ptr[CRayObject]] *results) CRayStatus GetIfLocal( const c_vector[CObjectID] &ids, diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index fa7805a5b9de0..3795baaddf9fc 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -8,31 +8,34 @@ import ray import ray.cluster_utils +import ray.experimental.channel as ray_channel logger = logging.getLogger(__name__) def test_put_local_get(ray_start_regular): - ref = ray._create_channel(1000) + chan = ray_channel.Channel(1000) num_writes = 1000 for i in range(num_writes): val = i.to_bytes(8, "little") - ray._write_channel(val, ref, num_readers=1) - assert ray.get(ref) == val - ray._end_read_channel(ref) + chan.write(val, num_readers=1) + assert chan.begin_read() == val + chan.end_read() def test_put_different_meta(ray_start_regular): - ref = ray._create_channel(1000) + chan = ray_channel.Channel(1000) def _test(val): - ray._write_channel(val, ref, num_readers=1) + chan.write(val, num_readers=1) + + read_val = chan.begin_read() if isinstance(val, np.ndarray): - assert np.array_equal(ray.get(ref), val) + assert np.array_equal(read_val, val) else: - assert ray.get(ref) == val - ray._end_read_channel(ref) + assert read_val == val + chan.end_read() _test(b"hello") _test("hello") @@ -47,25 +50,25 @@ def _test(val): @pytest.mark.parametrize("num_readers", [1, 4]) def test_put_remote_get(ray_start_regular, num_readers): - ref = ray._create_channel(1000) + chan = ray_channel.Channel(1000) @ray.remote(num_cpus=0) class Reader: def __init__(self): pass - def read(self, ref, num_writes): + def read(self, chan, num_writes): for i in range(num_writes): val = i.to_bytes(8, "little") - assert ray.get(ref[0]) == val - ray._end_read_channel(ref) + assert chan.begin_read() == val + chan.end_read() num_writes = 1000 readers = [Reader.remote() for _ in range(num_readers)] - done = [reader.read.remote([ref], num_writes) for reader in readers] + done = [reader.read.remote(chan, num_writes) for reader in readers] for i in range(num_writes): val = i.to_bytes(8, "little") - ray._write_channel(val, ref, num_readers=num_readers) + chan.write(val, num_readers=num_readers) ray.get(done) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 72f5b698d9234..d58640908c905 100644 --- a/src/ray/core_worker/core_worker.cc 
+++ b/src/ray/core_worker/core_worker.cc @@ -1219,7 +1219,7 @@ Status CoreWorker::Put(const RayObject &object, } Status CoreWorker::CreateOwnedAndIncrementLocalRef( - bool is_mutable, + bool is_experimental_mutable_object, const std::shared_ptr &metadata, const size_t data_size, const std::vector &contained_object_ids, @@ -1295,7 +1295,7 @@ Status CoreWorker::CreateOwnedAndIncrementLocalRef( /* owner_address = */ real_owner_address, data, created_by_worker, - is_mutable); + is_experimental_mutable_object); } if (!status.ok()) { RemoveLocalReference(*object_id); @@ -1326,12 +1326,13 @@ Status CoreWorker::CreateExisting(const std::shared_ptr &metadata, } } -Status CoreWorker::WriteAcquireMutableObject(const ObjectID &object_id, - const std::shared_ptr &metadata, - uint64_t data_size, - int64_t num_readers, - std::shared_ptr *data) { - return plasma_store_provider_->WriteAcquireMutableObject( +Status CoreWorker::ExperimentalMutableObjectWriteAcquire( + const ObjectID &object_id, + const std::shared_ptr &metadata, + uint64_t data_size, + int64_t num_readers, + std::shared_ptr *data) { + return plasma_store_provider_->ExperimentalMutableObjectWriteAcquire( object_id, metadata, data_size, num_readers, data); } @@ -1378,13 +1379,15 @@ Status CoreWorker::SealExisting(const ObjectID &object_id, return Status::OK(); } -Status CoreWorker::GetRelease(const std::vector &object_ids) { +Status CoreWorker::ExperimentalMutableObjectReadRelease( + const std::vector &object_ids) { RAY_CHECK(object_ids.size() == 1); - return plasma_store_provider_->GetRelease(object_ids[0]); + return plasma_store_provider_->ExperimentalMutableObjectReadRelease(object_ids[0]); } Status CoreWorker::Get(const std::vector &ids, const int64_t timeout_ms, + bool is_experimental_mutable_object, std::vector> *results) { std::unique_ptr state = nullptr; if (options_.worker_type == WorkerType::WORKER) { @@ -1452,6 +1455,7 @@ Status CoreWorker::Get(const std::vector &ids, RAY_LOG(DEBUG) << "Plasma GET timeout " << local_timeout_ms; RAY_RETURN_NOT_OK(plasma_store_provider_->Get(plasma_object_ids, local_timeout_ms, + is_experimental_mutable_object, worker_context_, &result_map, &got_exception)); @@ -2904,8 +2908,12 @@ bool CoreWorker::PinExistingReturnObject(const ObjectID &return_id, reference_counter_->AddLocalReference(return_id, ""); reference_counter_->AddBorrowedObject(return_id, ObjectID::Nil(), owner_address); - auto status = plasma_store_provider_->Get( - {return_id}, 0, worker_context_, &result_map, &got_exception); + auto status = plasma_store_provider_->Get({return_id}, + 0, + /*is_experimental_mutable_object=*/false, + worker_context_, + &result_map, + &got_exception); // Remove the temporary ref. 
RemoveLocalReference(return_id); @@ -3168,8 +3176,13 @@ Status CoreWorker::GetAndPinArgsForExecutor(const TaskSpecification &task, RAY_RETURN_NOT_OK( memory_store_->Get(by_ref_ids, -1, worker_context_, &result_map, &got_exception)); } else { - RAY_RETURN_NOT_OK(plasma_store_provider_->Get( - by_ref_ids, -1, worker_context_, &result_map, &got_exception)); + RAY_RETURN_NOT_OK( + plasma_store_provider_->Get(by_ref_ids, + -1, + /*is_experimental_mutable_object=*/false, + worker_context_, + &result_map, + &got_exception)); } for (const auto &it : result_map) { for (size_t idx : by_ref_indices[it.first]) { @@ -4163,7 +4176,11 @@ void CoreWorker::PlasmaCallback(SetResultCallback success, bool object_is_local = false; if (Contains(object_id, &object_is_local).ok() && object_is_local) { std::vector> vec; - if (Get(std::vector{object_id}, 0, &vec).ok()) { + if (Get(std::vector{object_id}, + 0, + /*is_experimental_mutable_object=*/false, + &vec) + .ok()) { RAY_CHECK(vec.size() > 0) << "Failed to get local object but Raylet notified object is local."; return success(vec.front(), object_id, py_future); diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 3db71661d8695..ea01d202fba75 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -602,6 +602,10 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// ensure that they decrement the ref count once the returned ObjectRef has /// gone out of scope. /// + /// \param[in] is_experimental_mutable_object Whether this object is an + /// experimental mutable object. If true, then the returned object buffer + /// will not be available to read until the caller Seals and then writes + /// again. /// \param[in] metadata Metadata of the object to be written. /// \param[in] data_size Size of the object to be written. /// \param[in] contained_object_ids The IDs serialized in this object. @@ -614,7 +618,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// small. /// \return Status. Status CreateOwnedAndIncrementLocalRef( - bool is_mutable, + bool is_experimental_mutable_object, const std::shared_ptr &metadata, const size_t data_size, const std::vector &contained_object_ids, @@ -643,12 +647,6 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { std::shared_ptr *data, bool created_by_worker); - Status WriteAcquireMutableObject(const ObjectID &object_id, - const std::shared_ptr &metadata, - uint64_t data_size, - int64_t num_readers, - std::shared_ptr *data); - /// Finalize placing an object into the object store. This should be called after /// a corresponding `CreateOwned()` call and then writing into the returned buffer. /// @@ -682,7 +680,31 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { const ObjectID &generator_id = ObjectID::Nil(), const std::unique_ptr &owner_address = nullptr); - Status GetRelease(const std::vector &object_ids); + /// Experimental method for mutable objects. Acquires a write lock on the + /// object that prevents readers from reading until we are done writing. Does + /// not protect against concurrent writers. + /// + /// \param[in] object_id The ID of the object. + /// \param[in] metadata The metadata of the object. This overwrites the + /// current metadata. + /// \param[in] data_size The size of the object to write. This overwrites the + /// current data size. + /// \param[in] num_readers The number of readers that must read and release + /// the object before the caller can write again. 
+ /// \param[out] data The mutable object buffer in plasma that can be written to. + Status ExperimentalMutableObjectWriteAcquire(const ObjectID &object_id, + const std::shared_ptr &metadata, + uint64_t data_size, + int64_t num_readers, + std::shared_ptr *data); + + /// Experimental method for mutable objects. Releases the objects, allowing them + /// to be written again. If the caller did not previously Get the objects, + /// then this first blocks until the latest value is available to read, then + /// releases the value. + /// + /// \param[in] object_ids The IDs of the objects. + Status ExperimentalMutableObjectReadRelease(const std::vector &object_ids); /// Get a list of objects from the object store. Objects that failed to be retrieved /// will be returned as nullptrs. @@ -693,6 +715,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// \return Status. Status Get(const std::vector &ids, const int64_t timeout_ms, + bool is_experimental_mutable_object, std::vector> *results); /// Get objects directly from the local plasma store, without waiting for the diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.cc b/src/ray/core_worker/store_provider/plasma_store_provider.cc index 30ae14daef662..6bfd686a987f1 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.cc +++ b/src/ray/core_worker/store_provider/plasma_store_provider.cc @@ -108,18 +108,19 @@ Status CoreWorkerPlasmaStoreProvider::Put(const RayObject &object, return Status::OK(); } -Status CoreWorkerPlasmaStoreProvider::WriteAcquireMutableObject( +Status CoreWorkerPlasmaStoreProvider::ExperimentalMutableObjectWriteAcquire( const ObjectID &object_id, const std::shared_ptr &metadata, uint64_t data_size, int64_t num_readers, std::shared_ptr *data) { - return store_client_.WriteAcquireMutableObject(object_id, - data_size, - metadata ? metadata->Data() : nullptr, - metadata ? metadata->Size() : 0, - num_readers, - data); + return store_client_.ExperimentalMutableObjectWriteAcquire( + object_id, + data_size, + metadata ? metadata->Data() : nullptr, + metadata ? 
metadata->Size() : 0, + num_readers, + data); } Status CoreWorkerPlasmaStoreProvider::Create(const std::shared_ptr &metadata, @@ -181,19 +182,21 @@ Status CoreWorkerPlasmaStoreProvider::FetchAndGetFromPlasmaStore( absl::flat_hash_set &remaining, const std::vector &batch_ids, int64_t timeout_ms, + bool send_fetch_or_reconstruct_ipc, bool fetch_only, bool in_direct_call, const TaskID &task_id, absl::flat_hash_map> *results, bool *got_exception) { - const auto owner_addresses = reference_counter_->GetOwnerAddresses(batch_ids); - // TODO this IPC needs to be skipped in shared mode - // RAY_RETURN_NOT_OK( - // raylet_client_->FetchOrReconstruct(batch_ids, - // owner_addresses, - // fetch_only, - // /*mark_worker_blocked*/ !in_direct_call, - // task_id)); + if (send_fetch_or_reconstruct_ipc) { + const auto owner_addresses = reference_counter_->GetOwnerAddresses(batch_ids); + RAY_RETURN_NOT_OK( + raylet_client_->FetchOrReconstruct(batch_ids, + owner_addresses, + fetch_only, + /*mark_worker_blocked*/ !in_direct_call, + task_id)); + } std::vector plasma_results; RAY_RETURN_NOT_OK(store_client_.Get(batch_ids, @@ -232,8 +235,9 @@ Status CoreWorkerPlasmaStoreProvider::FetchAndGetFromPlasmaStore( return Status::OK(); } -Status CoreWorkerPlasmaStoreProvider::GetRelease(const ObjectID &object_id) { - return store_client_.GetRelease(object_id); +Status CoreWorkerPlasmaStoreProvider::ExperimentalMutableObjectReadRelease( + const ObjectID &object_id) { + return store_client_.ExperimentalMutableObjectReadRelease(object_id); } Status CoreWorkerPlasmaStoreProvider::GetIfLocal( @@ -287,6 +291,7 @@ Status UnblockIfNeeded(const std::shared_ptr &client, Status CoreWorkerPlasmaStoreProvider::Get( const absl::flat_hash_set &object_ids, int64_t timeout_ms, + bool is_experimental_mutable_object, const WorkerContext &ctx, absl::flat_hash_map> *results, bool *got_exception) { @@ -302,14 +307,17 @@ Status CoreWorkerPlasmaStoreProvider::Get( for (int64_t i = start; i < batch_size && i < total_size; i++) { batch_ids.push_back(id_vector[start + i]); } - RAY_RETURN_NOT_OK(FetchAndGetFromPlasmaStore(remaining, - batch_ids, - /*timeout_ms=*/0, - /*fetch_only=*/true, - ctx.CurrentTaskIsDirectCall(), - ctx.GetCurrentTaskID(), - results, - got_exception)); + RAY_RETURN_NOT_OK(FetchAndGetFromPlasmaStore( + remaining, + batch_ids, + /*timeout_ms=*/0, + // Mutable objects must be local before ray.get. + /*send_fetch_or_reconstruct_ipc=*/!is_experimental_mutable_object, + /*fetch_only=*/true, + ctx.CurrentTaskIsDirectCall(), + ctx.GetCurrentTaskID(), + results, + got_exception)); } // If all objects were fetched already, return. Note that we always need to @@ -318,6 +326,8 @@ Status CoreWorkerPlasmaStoreProvider::Get( return UnblockIfNeeded(raylet_client_, ctx); } + RAY_CHECK(!is_experimental_mutable_object) << "Mutable objects must always be local"; + // If not all objects were successfully fetched, repeatedly call FetchOrReconstruct // and Get from the local object store in batches. This loop will run indefinitely // until the objects are all fetched if timeout is -1. 
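The send_fetch_or_reconstruct_ipc flag above is what makes mutable channel objects local-only: no fetch request is sent to the raylet, so readers must be co-located with the plasma store that holds the channel. A rough sketch of that usage pattern, assuming a single-node ray.init() (the Reader actor name is illustrative; compare test_put_remote_get):

import ray
import ray.experimental.channel as ray_channel

ray.init()

@ray.remote(num_cpus=0)
class Reader:
    def read(self, chan):
        # begin_read() blocks until the writer publishes a value. No
        # FetchOrReconstruct IPC is issued, so the underlying plasma object
        # must already live in this node's local store.
        val = chan.begin_read()
        chan.end_read()
        return val

chan = ray_channel.Channel(1000)
reader = Reader.remote()
result = reader.read.remote(chan)  # Blocks inside the actor until the write below.

chan.write(b"ping", num_readers=1)
assert ray.get(result) == b"ping"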
@@ -351,6 +361,7 @@ Status CoreWorkerPlasmaStoreProvider::Get( RAY_RETURN_NOT_OK(FetchAndGetFromPlasmaStore(remaining, batch_ids, batch_timeout, + /*send_fetch_or_reconstruct_ipc=*/true, /*fetch_only=*/false, ctx.CurrentTaskIsDirectCall(), ctx.GetCurrentTaskID(), diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.h b/src/ray/core_worker/store_provider/plasma_store_provider.h index 2c7242a02f4a1..fff93c48c2e4a 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.h +++ b/src/ray/core_worker/store_provider/plasma_store_provider.h @@ -129,12 +129,6 @@ class CoreWorkerPlasmaStoreProvider { bool created_by_worker, bool is_mutable = false); - Status WriteAcquireMutableObject(const ObjectID &object_id, - const std::shared_ptr &metadata, - uint64_t data_size, - int64_t num_readers, - std::shared_ptr *data); - /// Seal an object buffer created with Create(). /// /// NOTE: The caller must subsequently call Release() to release the first reference to @@ -154,12 +148,11 @@ class CoreWorkerPlasmaStoreProvider { Status Get(const absl::flat_hash_set &object_ids, int64_t timeout_ms, + bool is_experimental_mutable_object, const WorkerContext &ctx, absl::flat_hash_map> *results, bool *got_exception); - Status GetRelease(const ObjectID &object_id); - /// Get objects directly from the local plasma store, without waiting for the /// objects to be fetched from another node. This should only be used /// internally, never by user code. @@ -189,6 +182,32 @@ class CoreWorkerPlasmaStoreProvider { std::string MemoryUsageString(); + /// Experimental method for mutable objects. Acquires a write lock on the + /// object that prevents readers from reading until we are done writing. Does + /// not protect against concurrent writers. + /// + /// \param[in] object_id The ID of the object. + /// \param[in] metadata The metadata of the object. This overwrites the + /// current metadata. + /// \param[in] data_size The size of the object to write. This overwrites the + /// current data size. + /// \param[in] num_readers The number of readers that must read and release + /// the object before the caller can write again. + /// \param[out] data The mutable object buffer in plasma that can be written to. + Status ExperimentalMutableObjectWriteAcquire(const ObjectID &object_id, + const std::shared_ptr &metadata, + uint64_t data_size, + int64_t num_readers, + std::shared_ptr *data); + + /// Experimental method for mutable objects. Releases the objects, allowing them + /// to be written again. If the caller did not previously Get the objects, + /// then this first blocks until the latest value is available to read, then + /// releases the value. + /// + /// \param[in] object_id The ID of the object. + Status ExperimentalMutableObjectReadRelease(const ObjectID &object_id); + private: /// Ask the raylet to fetch a set of objects and then attempt to get them /// from the local plasma store. 
Successfully fetched objects will be removed @@ -211,6 +230,7 @@ class CoreWorkerPlasmaStoreProvider { absl::flat_hash_set &remaining, const std::vector &batch_ids, int64_t timeout_ms, + bool send_fetch_or_reconstruct_ipc, bool fetch_only, bool in_direct_call_task, const TaskID &task_id, diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index 7c95845012896..a362f784e41f7 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -37,15 +37,6 @@ void PlasmaObjectHeader::Destroy() { RAY_CHECK(sem_destroy(&rw_semaphore) == 0); } -// Get the data size of the plasma object. -// This has to be called only when reader lock is acquired -// via ReadAcquire. -uint64_t PlasmaObjectHeader::GetDataSize() const { - RAY_CHECK_NE(num_read_releases_remaining, 0) - << "ReadAcquire has to be called before calling this method."; - return data_size; -} - void PlasmaObjectHeader::WriteAcquire(int64_t write_version, uint64_t write_data_size, uint64_t write_metadata_size, diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 31083942a928d..6d9a8655dd2c9 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -39,6 +39,12 @@ using RestoreSpilledObjectCallback = const std::string &, std::function)>; +/// A header for all plasma objects that is allocated and stored in shared +/// memory. Therefore, it can be accessed across processes. +/// +/// For normal immutable objects, no synchronization between processes is +/// needed once the object has been Sealed. For experimental mutable objects, +/// we use the header to synchronize between writer and readers. struct PlasmaObjectHeader { // Used to signal to the writer when all readers are done. sem_t rw_semaphore; @@ -77,8 +83,10 @@ struct PlasmaObjectHeader { void Destroy(); - // Blocks until there are no more readers. - // NOTE: Caller should ensure there is one writer at a time. + /// Blocks until all readers for the previous write have ReadRelease'd the value. + /// Caller must ensure there is one writer at a time. Caller must pass + /// consecutive versions on each new write, starting with write_version=1. + /// /// \param write_version The new version for write. /// \param data_size The new data size of the object. /// \param metadata_size The new metadata size of the object. @@ -90,23 +98,28 @@ struct PlasmaObjectHeader { // Call after completing a write to signal that readers may read. // num_readers should be set before calling this. + /// + /// \param write_version The new version for write. This must match the + /// version previously passed to WriteAcquire. void WriteRelease(int64_t write_version); // Blocks until the given version or a more recent version is ready to read. + // If num_readers have already read this version, then this call will hang. // // \param read_version The minimum version to wait for. // \return The version that was read. This should be passed to ReadRelease - // when the reader is done. + // when the reader is done. Returns 0 if the object is a normal immutable + // object, meaning no ReadRelease is needed. + /// + /// \param read_version Read at least this version. int64_t ReadAcquire(int64_t read_version); - // Finishes the read. If all reads are done, signals to the - // writer. This is not necessary to call for objects that have - // num_readers=-1. + // Finishes the read. If all reads are done, signals to the writer. This is + // not necessary to call for objects that have num_readers=-1. 
+ /// + /// \param read_version This must match the version previously passed in + /// ReadAcquire. void ReadRelease(int64_t read_version); - - // Get the data size of the plasma object. - // The reader must first ReadAcquire. - uint64_t GetDataSize() const; }; /// A struct that includes info about the object. diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index d76f028c0d916..1741e72b1315d 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -95,14 +95,20 @@ struct ObjectInUseEntry { PlasmaObject object; /// A flag representing whether the object has been sealed. bool is_sealed; + + /// The below fields are experimental and used to implement + /// ray.experimental.channel. + /// + /// Whether the object is mutable. Most objects are immutable and cannot be + /// written to after the initial Create and Seal call. Mutable objects are + /// used to implement ray.experimental.channel. bool is_mutable = false; - /// For shared objects only. /// The last version that we read. To read again, we must pass a newer /// version than this. int64_t next_version_to_read = 1; /// Whether we currently have a read lock on the object. If this is true, /// then it is safe to read the value of the object. For immutable objects, - /// this will always be true once the object has been sealed. For immutable + /// this will always be true once the object has been sealed. For mutable /// objects, ReadRelease resets this to false, and ReadAcquire resets to /// true. bool read_acquired = false; @@ -150,12 +156,12 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data); + Status ExperimentalMutableObjectWriteAcquire(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data); Status Get(const std::vector &object_ids, int64_t timeout_ms, @@ -170,7 +176,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this &object_entry); - Status GetRelease(const ObjectID &object_id); + Status ExperimentalMutableObjectReadRelease(const ObjectID &object_id); Status Release(const ObjectID &object_id); @@ -226,9 +232,11 @@ class PlasmaClient::Impl : public std::enable_shared_from_this object, + bool is_sealed); + + void IncrementObjectCount(const ObjectID &object_id); /// The boost::asio IO context for the client. instrumented_io_context main_service_; @@ -306,31 +314,28 @@ bool PlasmaClient::Impl::IsInUse(const ObjectID &object_id) { return (elem != objects_in_use_.end()); } -void PlasmaClient::Impl::IncrementObjectCount(const ObjectID &object_id, - const PlasmaObject *object, - bool is_sealed) { +void PlasmaClient::Impl::InsertObjectInUse(const ObjectID &object_id, + std::unique_ptr object, + bool is_sealed) { + auto inserted = + objects_in_use_.insert({object_id, std::make_unique()}); + RAY_CHECK(inserted.second) << "Object already in use"; + auto it = inserted.first; + + // Add this object ID to the hash table of object IDs in use. The + // corresponding call to free happens in PlasmaClient::Release. + it->second->object = *object.release(); + // Count starts at 1 to pin the object. + it->second->count = 1; + it->second->is_sealed = is_sealed; +} + +void PlasmaClient::Impl::IncrementObjectCount(const ObjectID &object_id) { // Increment the count of the object to track the fact that it is being used. // The corresponding decrement should happen in PlasmaClient::Release. 
- auto elem = objects_in_use_.find(object_id); - ObjectInUseEntry *object_entry; - if (elem == objects_in_use_.end()) { - RAY_CHECK(object != nullptr); - // Add this object ID to the hash table of object IDs in use. The - // corresponding call to free happens in PlasmaClient::Release. - objects_in_use_[object_id] = std::make_unique(); - objects_in_use_[object_id]->object = *object; - objects_in_use_[object_id]->count = 0; - objects_in_use_[object_id]->is_sealed = is_sealed; - object_entry = objects_in_use_[object_id].get(); - } else { - object_entry = elem->second.get(); - // TODO(swang): Nicer way to pin shared objects. - // RAY_CHECK(object_entry->count > 0); - } - // Increment the count of the number of instances of this object that are - // being used by this client. The corresponding decrement should happen in - // PlasmaClient::Release. - object_entry->count += 1; + auto object_entry = objects_in_use_.find(object_id); + RAY_CHECK(object_entry != objects_in_use_.end()); + object_entry->second->count += 1; } Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, @@ -340,7 +345,7 @@ Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, std::vector buffer; RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaCreateReply, &buffer)); ObjectID id; - PlasmaObject object; + auto object = std::make_unique(); MEMFD_TYPE store_fd; int64_t mmap_size; @@ -349,7 +354,7 @@ Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, buffer.size(), &id, retry_with_request_id, - &object, + object.get(), &store_fd, &mmap_size)); if (*retry_with_request_id > 0) { @@ -359,51 +364,50 @@ Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, } else { uint64_t unused = 0; RAY_RETURN_NOT_OK(ReadCreateReply( - buffer.data(), buffer.size(), &id, &unused, &object, &store_fd, &mmap_size)); + buffer.data(), buffer.size(), &id, &unused, object.get(), &store_fd, &mmap_size)); RAY_CHECK(unused == 0); } // If the CreateReply included an error, then the store will not send a file // descriptor. - if (object.device_num == 0) { + if (object->device_num == 0) { // The metadata should come right after the data. - RAY_CHECK(object.metadata_offset == object.data_offset + object.data_size); + RAY_CHECK(object->metadata_offset == object->data_offset + object->data_size); RAY_LOG(DEBUG) << "GetStoreFdAndMmap " << store_fd.first << ", " << store_fd.second << ", size " << mmap_size << " for object id " << id; *data = std::make_shared( shared_from_this(), - GetStoreFdAndMmap(store_fd, mmap_size) + object.data_offset, - object.data_size); + GetStoreFdAndMmap(store_fd, mmap_size) + object->data_offset, + object->data_size); // If plasma_create is being called from a transfer, then we will not copy the // metadata here. The metadata will be written along with the data streamed // from the transfer. if (metadata != NULL) { // Copy the metadata to the buffer. - memcpy((*data)->Data() + object.data_size, metadata, object.metadata_size); + memcpy((*data)->Data() + object->data_size, metadata, object->metadata_size); } } else { RAY_LOG(FATAL) << "GPU is not enabled."; } - // Increment the count of the number of instances of this object that this - // client is using. A call to PlasmaClient::Release is required to decrement - // this count. Cache the reference to the object. - IncrementObjectCount(object_id, &object, false); - // TODO(swang): Remove the second increment call. + // Add the object as in use. 
A call to PlasmaClient::Release is required to + // decrement the initial ref count of 1. Cache the reference to the object. + InsertObjectInUse(object_id, std::move(object), /*is_sealed=*/false); // We increment the count a second time (and the corresponding decrement will // happen in a PlasmaClient::Release call in plasma_seal) so even if the // buffer returned by PlasmaClient::Create goes out of scope, the object does // not get released before the call to PlasmaClient::Seal happens. - IncrementObjectCount(object_id, &object, false); + IncrementObjectCount(object_id); return Status::OK(); } -Status PlasmaClient::Impl::WriteAcquireMutableObject(const ObjectID &object_id, - int64_t data_size, - const uint8_t *metadata, - int64_t metadata_size, - int64_t num_readers, - std::shared_ptr *data) { +Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( + const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data) { std::unique_lock guard(client_mutex_); auto object_entry = objects_in_use_.find(object_id); RAY_CHECK(object_entry != objects_in_use_.end()); @@ -590,7 +594,7 @@ Status PlasmaClient::Impl::GetBuffers( object_buffers[i].device_num = object->device_num; // Increment the count of the number of instances of this object that this // client is using. Cache the reference to the object. - IncrementObjectCount(object_ids[i], object, true); + IncrementObjectCount(object_ids[i]); } } @@ -606,7 +610,7 @@ Status PlasmaClient::Impl::GetBuffers( RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaGetReply, &buffer)); std::vector received_object_ids(num_objects); std::vector object_data(num_objects); - PlasmaObject *object; + auto object = std::make_unique(); std::vector store_fds; std::vector mmap_sizes; RAY_RETURN_NOT_OK(ReadGetReply(buffer.data(), @@ -629,7 +633,7 @@ Status PlasmaClient::Impl::GetBuffers( for (int64_t i = 0; i < num_objects; ++i) { RAY_DCHECK(received_object_ids[i] == object_ids[i]); - object = &object_data[i]; + *object = object_data[i]; if (object_buffers[i].data) { // If the object was already in use by the client, then the store should // have returned it. @@ -643,7 +647,7 @@ Status PlasmaClient::Impl::GetBuffers( if (object->data_size != -1) { // Increment the count of the number of instances of this object that this // client is using. Cache the reference to the object. - IncrementObjectCount(received_object_ids[i], object, true); + InsertObjectInUse(received_object_ids[i], std::move(object), /*is_sealed=*/true); auto &object_entry = objects_in_use_[received_object_ids[i]]; // Wait for the object to become ready to read. 
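After EnsureGetAcquired, the data and metadata buffers are sliced using the sizes recorded in the object-in-use entry (refreshed from the shared header), so a reader sees the size of the value most recently written rather than the allocated size. At the Python level this is what lets consecutive writes carry values of different types and sizes; a small sketch, assuming a single-node ray.init() (mirrors test_put_different_meta):

import numpy as np
import ray
import ray.experimental.channel as ray_channel

ray.init()

chan = ray_channel.Channel(1000)

# Each write may produce a different serialized data/metadata size; the
# reader picks the new sizes up from the shared header on begin_read().
for val in [b"hello", "hello", 1000, np.random.rand(10)]:
    chan.write(val, num_readers=1)
    read_val = chan.begin_read()
    if isinstance(val, np.ndarray):
        assert np.array_equal(read_val, val)
    else:
        assert read_val == val
    chan.end_read()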
@@ -651,22 +655,25 @@ Status PlasmaClient::Impl::GetBuffers( EnsureGetAcquired(object_entry); std::shared_ptr physical_buf; RAY_LOG(DEBUG) << "Plasma Get " << received_object_ids[i] - << ", data size: " << object->data_size - << ", metadata size: " << object->metadata_size; - if (object->device_num == 0) { - uint8_t *data = LookupMmappedFile(object->store_fd); + << ", data size: " << object_entry->object.data_size + << ", metadata size: " << object_entry->object.metadata_size; + if (object_entry->object.device_num == 0) { + uint8_t *data = LookupMmappedFile(object_entry->object.store_fd); physical_buf = std::make_shared( - data + object->data_offset, object->data_size + object->metadata_size); + data + object_entry->object.data_offset, + object_entry->object.data_size + object_entry->object.metadata_size); } else { RAY_LOG(FATAL) << "Arrow GPU library is not enabled."; } // Finish filling out the return values. physical_buf = wrap_buffer(object_ids[i], physical_buf); object_buffers[i].data = - SharedMemoryBuffer::Slice(physical_buf, 0, object->data_size); - object_buffers[i].metadata = SharedMemoryBuffer::Slice( - physical_buf, object->data_size, object->metadata_size); - object_buffers[i].device_num = object->device_num; + SharedMemoryBuffer::Slice(physical_buf, 0, object_entry->object.data_size); + object_buffers[i].metadata = + SharedMemoryBuffer::Slice(physical_buf, + object_entry->object.data_size, + object_entry->object.metadata_size); + object_buffers[i].device_num = object_entry->object.device_num; } else { // The object was not retrieved. The caller can detect this condition // by checking the boolean value of the metadata/data buffers. @@ -716,10 +723,10 @@ void PlasmaClient::Impl::EnsureGetAcquired( RAY_CHECK(object_entry->object.data_size + object_entry->object.metadata_size <= object_entry->object.allocated_size); } - return; } -Status PlasmaClient::Impl::GetRelease(const ObjectID &object_id) { +Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( + const ObjectID &object_id) { RAY_LOG(DEBUG) << "Try to release Get for object " << object_id; std::unique_lock guard(client_mutex_); @@ -999,13 +1006,14 @@ Status PlasmaClient::Connect(const std::string &store_socket_name, store_socket_name, manager_socket_name, release_delay, num_retries); } -Status PlasmaClient::WriteAcquireMutableObject(const ObjectID &object_id, - int64_t data_size, - const uint8_t *metadata, - int64_t metadata_size, - int64_t num_readers, - std::shared_ptr *data) { - return impl_->WriteAcquireMutableObject( +Status PlasmaClient::ExperimentalMutableObjectWriteAcquire( + const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data) { + return impl_->ExperimentalMutableObjectWriteAcquire( object_id, data_size, metadata, metadata_size, num_readers, data); } @@ -1054,8 +1062,8 @@ Status PlasmaClient::Get(const std::vector &object_ids, return impl_->Get(object_ids, timeout_ms, object_buffers, is_from_worker); } -Status PlasmaClient::GetRelease(const ObjectID &object_id) { - return impl_->GetRelease(object_id); +Status PlasmaClient::ExperimentalMutableObjectReadRelease(const ObjectID &object_id) { + return impl_->ExperimentalMutableObjectReadRelease(object_id); } Status PlasmaClient::Release(const ObjectID &object_id) { diff --git a/src/ray/object_manager/plasma/client.h b/src/ray/object_manager/plasma/client.h index 00c85cca3f11e..e3f1aa1b05e3f 100644 --- a/src/ray/object_manager/plasma/client.h +++ 
b/src/ray/object_manager/plasma/client.h @@ -82,7 +82,34 @@ class PlasmaClientInterface { std::vector *object_buffers, bool is_from_worker) = 0; - virtual Status GetRelease(const ObjectID &object_id) = 0; + /// Experimental method for mutable objects. Acquires a write lock on the + /// object that prevents readers from reading until we are done writing. Does + /// not protect against concurrent writers. + /// + /// \param[in] object_id The ID of the object. + /// \param[in] data_size The size of the object to write. This overwrites the + /// current data size. + /// \param[in] metadata A pointer to the object metadata buffer to copy. This + /// will overwrite the current metadata. + /// \param[in] metadata_size The number of bytes to copy from the metadata + /// pointer. + /// \param[in] num_readers The number of readers that must read and release + /// the object before the caller can write again. + /// \param[out] data The mutable object buffer in plasma that can be written to. + virtual Status ExperimentalMutableObjectWriteAcquire(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data) = 0; + + /// Experimental method for mutable objects. Releases the objects, allowing them + /// to be written again. If the caller did not previously Get the objects, + /// then this first blocks until the latest value is available to read, then + /// releases the value. + /// + /// \param[in] object_id The ID of the object. + virtual Status ExperimentalMutableObjectReadRelease(const ObjectID &object_id) = 0; /// Seal an object in the object store. The object will be immutable after /// this @@ -137,13 +164,6 @@ class PlasmaClientInterface { plasma::flatbuf::ObjectSource source, int device_num = 0) = 0; - virtual Status WriteAcquireMutableObject(const ObjectID &object_id, - int64_t data_size, - const uint8_t *metadata, - int64_t metadata_size, - int64_t num_readers, - std::shared_ptr *data) = 0; - /// Delete a list of objects from the object store. This currently assumes that the /// object is present, has been sealed and not used by another client. Otherwise, /// it is a no operation. @@ -211,12 +231,12 @@ class PlasmaClient : public PlasmaClientInterface { plasma::flatbuf::ObjectSource source, int device_num = 0); - Status WriteAcquireMutableObject(const ObjectID &object_id, - int64_t data_size, - const uint8_t *metadata, - int64_t metadata_size, - int64_t num_readers, - std::shared_ptr *data); + Status ExperimentalMutableObjectWriteAcquire(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data); /// Create an object in the Plasma Store. Any metadata for this object must be /// be passed in when the object is created. @@ -273,7 +293,7 @@ class PlasmaClient : public PlasmaClientInterface { std::vector *object_buffers, bool is_from_worker); - Status GetRelease(const ObjectID &object_id); + Status ExperimentalMutableObjectReadRelease(const ObjectID &object_id); /// Tell Plasma that the client no longer needs the object. This should be /// called after Get() or Create() when the client is done with the object. 
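Taken together, this patch leaves the user-facing surface as ray.experimental.channel.Channel, backed by the ExperimentalMutableObjectWriteAcquire/ReadRelease plasma calls above. A minimal multi-reader sketch of that surface, assuming a single-node ray.init() so all readers share the writer's plasma store (the Reader actor and the counts are illustrative; compare test_put_remote_get):

import ray
import ray.experimental.channel as ray_channel

ray.init()

@ray.remote(num_cpus=0)
class Reader:
    def read(self, chan, num_writes):
        for i in range(num_writes):
            assert chan.begin_read() == i.to_bytes(8, "little")
            chan.end_read()

num_readers = 4
num_writes = 100
chan = ray_channel.Channel(1000)
readers = [Reader.remote() for _ in range(num_readers)]
done = [reader.read.remote(chan, num_writes) for reader in readers]

for i in range(num_writes):
    # The writer blocks here until all num_readers readers have released
    # the previous value.
    chan.write(i.to_bytes(8, "little"), num_readers=num_readers)
ray.get(done)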
From ea57894f405c757d8faa6ed6077aecfc7c9db0cf Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 1 Dec 2023 16:09:53 -0800 Subject: [PATCH 16/66] Test for errors, better error handling when too many readers Signed-off-by: Stephanie Wang --- python/ray/_private/ray_perf.py | 10 +++-- python/ray/experimental/channel.py | 29 ++++++++------ python/ray/tests/test_accelerated_dag.py | 49 ++++++++++++++++++++++++ src/ray/object_manager/common.cc | 44 ++++++++++++--------- src/ray/object_manager/common.h | 39 ++++++++++++------- src/ray/object_manager/plasma/client.cc | 37 ++++++++++++++---- 6 files changed, 153 insertions(+), 55 deletions(-) diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py index dcc49d42b2926..330527957d675 100644 --- a/python/ray/_private/ray_perf.py +++ b/python/ray/_private/ray_perf.py @@ -296,9 +296,9 @@ def async_actor_multi(): ray.init() - def put_channel_small(chans, num_readers=1, do_get=False, do_release=False): + def put_channel_small(chans, do_get=False, do_release=False): for chan in chans: - chan.write(b"0", num_readers=num_readers) + chan.write(b"0") if do_get: chan.begin_read() if do_release: @@ -337,14 +337,14 @@ def read(self, chans): n_cpu = multiprocessing.cpu_count() // 2 print(f"Testing multiple readers/channels, n={n_cpu}") - chans = [ray_channel.Channel(1000)] + chans = [ray_channel.Channel(1000, num_readers=n_cpu)] readers = [ChannelReader.remote() for _ in range(n_cpu)] ray.get([reader.ready.remote() for reader in readers]) for reader in readers: reader.read.remote(chans) results += timeit( "local put:n remote get, single channel calls", - lambda: put_channel_small(chans, num_readers=n_cpu), + lambda: put_channel_small(chans), ) for reader in readers: ray.kill(reader) @@ -369,6 +369,8 @@ def read(self, chans): for reader in readers: ray.kill(reader) + ray.shutdown() + ############################ # End of channel perf tests. ############################ diff --git a/python/ray/experimental/channel.py b/python/ray/experimental/channel.py index 42f82e4aa3398..e8ef9ad085f79 100644 --- a/python/ray/experimental/channel.py +++ b/python/ray/experimental/channel.py @@ -52,7 +52,7 @@ class Channel: ray.wait. """ - def __init__(self, buffer_size: Optional[int] = None): + def __init__(self, buffer_size: Optional[int] = None, num_readers: int = 1): """ Create a channel that can be read and written by co-located Ray processes. @@ -71,19 +71,20 @@ def __init__(self, buffer_size: Optional[int] = None): else: self._base_ref = _create_channel_ref(buffer_size) - self.worker = ray._private.worker.global_worker - self.worker.check_connected() + self._num_readers = num_readers + self._worker = ray._private.worker.global_worker + self._worker.check_connected() @staticmethod - def _from_base_ref(base_ref: "ray.ObjectRef") -> "Channel": - chan = Channel() + def _from_base_ref(base_ref: "ray.ObjectRef", num_readers: int) -> "Channel": + chan = Channel(num_readers=num_readers) chan._base_ref = base_ref return chan def __reduce__(self): - return self._from_base_ref, (self._base_ref,) + return self._from_base_ref, (self._base_ref, self._num_readers) - def write(self, value: Any, num_readers: int): + def write(self, value: Any, num_readers: Optional[int] = None): """ Write a value to the channel. @@ -96,11 +97,13 @@ def write(self, value: Any, num_readers: int): num_readers: The number of readers that must read and release the value before we can write again. 
""" + if num_readers is None: + num_readers = self._num_readers if num_readers <= 0: raise ValueError("``num_readers`` must be a positive integer.") try: - serialized_value = self.worker.get_serialization_context().serialize(value) + serialized_value = self._worker.get_serialization_context().serialize(value) except TypeError as e: sio = io.StringIO() ray.util.inspect_serializability(value, print_file=sio) @@ -111,7 +114,7 @@ def write(self, value: Any, num_readers: int): ) raise TypeError(msg) from e - self.worker.core_worker.experimental_mutable_object_put_serialized( + self._worker.core_worker.experimental_mutable_object_put_serialized( serialized_value, self._base_ref, num_readers, @@ -122,10 +125,14 @@ def begin_read(self) -> Any: Read the latest value from the channel. This call will block until a value is available to read. + Subsequent calls to begin_read() will return the same value, until + end_read() is called. Then, the client must begin_read() again to get + the next value. + Returns: Any: The deserialized value. """ - values, _ = self.worker.get_objects( + values, _ = self._worker.get_objects( [self._base_ref], _is_experimental_mutable_object=True ) return values[0] @@ -137,6 +144,6 @@ def end_read(self): If begin_read is not called first, then this call will block until a value is written, then drop the value. """ - self.worker.core_worker.experimental_mutable_object_read_release( + self._worker.core_worker.experimental_mutable_object_read_release( [self._base_ref] ) diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_accelerated_dag.py index 3795baaddf9fc..20e88a101efaf 100644 --- a/python/ray/tests/test_accelerated_dag.py +++ b/python/ray/tests/test_accelerated_dag.py @@ -21,9 +21,57 @@ def test_put_local_get(ray_start_regular): val = i.to_bytes(8, "little") chan.write(val, num_readers=1) assert chan.begin_read() == val + + # Begin read multiple times will return the same value. + assert chan.begin_read() == val + chan.end_read() +def test_errors(ray_start_regular): + @ray.remote + class Actor: + def make_chan(self, do_write=True): + self.chan = ray_channel.Channel(1000) + if do_write: + self.chan.write(b"hello", num_readers=1) + return self.chan + + a = Actor.remote() + # Only original creator can write. + chan = ray.get(a.make_chan.remote(do_write=False)) + with pytest.raises(ray.exceptions.RaySystemError): + chan.write(b"hi") + + # Only original creator can write. + chan = ray.get(a.make_chan.remote(do_write=True)) + assert chan.begin_read() == b"hello" + with pytest.raises(ray.exceptions.RaySystemError): + chan.write(b"hi") + + # Multiple consecutive reads from the same process are fine. + chan = ray.get(a.make_chan.remote(do_write=True)) + assert chan.begin_read() == b"hello" + assert chan.begin_read() == b"hello" + chan.end_read() + + @ray.remote + class Reader: + def __init__(self): + pass + + def read(self, chan): + return chan.begin_read() + + # Multiple reads from n different processes, where n > num_readers, errors. 
+ chan = ray.get(a.make_chan.remote(do_write=True)) + readers = [Reader.remote(), Reader.remote()] + # At least 1 reader + with pytest.raises(ray.exceptions.RayTaskError) as exc_info: + ray.get([reader.read.remote(chan) for reader in readers]) + assert "ray.exceptions.RaySystemError" in str(exc_info.value) + + def test_put_different_meta(ray_start_regular): chan = ray_channel.Channel(1000) @@ -42,6 +90,7 @@ def _test(val): _test(1000) _test(np.random.rand(10)) + # Cannot put a serialized value larger than the allocated buffer. with pytest.raises(ValueError): _test(np.random.rand(100)) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index a362f784e41f7..35a21ce0e4654 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -56,6 +56,7 @@ void PlasmaObjectHeader::WriteAcquire(int64_t write_version, << ". Are you sure this is the only writer?"; version = write_version; + is_sealed = false; data_size = write_data_size; metadata_size = write_metadata_size; num_readers = write_num_readers; @@ -76,6 +77,7 @@ void PlasmaObjectHeader::WriteRelease(int64_t write_version) { << version << ". Are you sure this is the only writer?"; version = write_version; + is_sealed = true; RAY_CHECK(num_readers != 0) << num_readers; num_read_acquires_remaining = num_readers; num_read_releases_remaining = num_readers; @@ -87,30 +89,36 @@ void PlasmaObjectHeader::WriteRelease(int64_t write_version) { RAY_CHECK(pthread_cond_broadcast(&cond) == 0); } -int64_t PlasmaObjectHeader::ReadAcquire(int64_t read_version) { - RAY_LOG(DEBUG) << "ReadAcquire waiting version " << read_version; +bool PlasmaObjectHeader::ReadAcquire(int64_t version_to_read, int64_t *version_read) { + RAY_LOG(DEBUG) << "ReadAcquire waiting version " << version_to_read; RAY_CHECK(pthread_mutex_lock(&wr_mut) == 0); - RAY_LOG(DEBUG) << "ReadAcquire " << read_version; + RAY_LOG(DEBUG) << "ReadAcquire " << version_to_read; PrintPlasmaObjectHeader(this); - while (version < read_version || num_read_acquires_remaining == 0) { + // Wait for the requested version (or a more recent one) to be sealed. + while (version < version_to_read || !is_sealed) { RAY_CHECK(pthread_cond_wait(&cond, &wr_mut) == 0); } - if (version > read_version) { - RAY_LOG(WARNING) << "Version " << version << " already exceeds version to read " - << read_version << ". May have missed earlier reads."; - } - - if (num_readers != -1) { - num_read_acquires_remaining--; - RAY_CHECK(num_read_acquires_remaining >= 0) - << "readers acquired exceeds max readers " << num_readers; - // This object can only be read a constant number of times. Tell the caller - // which version was read. - read_version = version; + bool success = false; + if (num_readers == -1) { + // Object is a normal immutable object. Read succeeds. + *version_read = 0; + success = true; } else { - read_version = 0; + *version_read = version; + if (version == version_to_read && num_read_acquires_remaining > 0) { + // This object is at the right version and still has reads remaining. Read + // succeeds. 
+ num_read_acquires_remaining--; + success = true; + } else if (version > version_to_read) { + RAY_LOG(WARNING) << "Version " << version << " already exceeds version to read " + << version_to_read; + } else { + RAY_LOG(WARNING) << "Version " << version << " already has " << num_readers + << "readers"; + } } RAY_LOG(DEBUG) << "ReadAcquire done"; @@ -119,7 +127,7 @@ int64_t PlasmaObjectHeader::ReadAcquire(int64_t read_version) { RAY_CHECK(pthread_mutex_unlock(&wr_mut) == 0); // Signal to other readers that they may read. RAY_CHECK(pthread_cond_signal(&cond) == 0); - return read_version; + return success; } void PlasmaObjectHeader::ReadRelease(int64_t read_version) { diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 6d9a8655dd2c9..eea4ccd8eb7ba 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -57,6 +57,12 @@ struct PlasmaObjectHeader { // the first write and then should never be modified. For mutable objects, // each new write must increment the version before releasing to readers. int64_t version = 0; + // Indicates whether the current version has been written. is_sealed=false + // means that there is a writer who has WriteAcquire'd but not yet + // WriteRelease'd the current version. is_sealed=true means that `version` + // has been WriteRelease'd. A reader may read the actual object value if + // is_sealed=true and num_read_acquires_remaining != 0. + bool is_sealed = false; // The total number of reads allowed before the writer can write again. This // value should be set by the writer before releasing to readers. // For immutable objects, this is set to -1 and infinite reads are allowed. @@ -66,6 +72,10 @@ struct PlasmaObjectHeader { // objects, readers must ensure this is > 0 and decrement before they read. // Once this value reaches 0, no more readers are allowed until the writer // writes a new version. + // NOTE(swang): Technically we do not need this because + // num_read_releases_remaining protects against too many readers. However, + // this allows us to throw an error as soon as the n+1-th reader begins, + // instead of waiting to error until the n+1-th reader is done reading. int64_t num_read_acquires_remaining = 0; // The number of readers who must release the current version before a new // version can be written. For mutable objects, readers must decrement this @@ -79,13 +89,15 @@ struct PlasmaObjectHeader { uint64_t data_size = 0; uint64_t metadata_size = 0; + /// Setup synchronization primitives. void Init(); + /// Destroy synchronization primitives. void Destroy(); - /// Blocks until all readers for the previous write have ReadRelease'd the value. - /// Caller must ensure there is one writer at a time. Caller must pass - /// consecutive versions on each new write, starting with write_version=1. + /// Blocks until all readers for the previous write have ReadRelease'd the + /// value. Protects against concurrent writers. Caller must pass consecutive + /// versions on each new write, starting with write_version=1. /// /// \param write_version The new version for write. /// \param data_size The new data size of the object. @@ -96,23 +108,22 @@ struct PlasmaObjectHeader { uint64_t metadata_size, int64_t num_readers); - // Call after completing a write to signal that readers may read. - // num_readers should be set before calling this. + /// Call after completing a write to signal that readers may read. + /// num_readers should be set before calling this. 
/// /// \param write_version The new version for write. This must match the /// version previously passed to WriteAcquire. void WriteRelease(int64_t write_version); - // Blocks until the given version or a more recent version is ready to read. - // If num_readers have already read this version, then this call will hang. + // Blocks until the given version is ready to read. Returns false if the + // maximum number of readers have already read the requested version. // - // \param read_version The minimum version to wait for. - // \return The version that was read. This should be passed to ReadRelease - // when the reader is done. Returns 0 if the object is a normal immutable - // object, meaning no ReadRelease is needed. - /// - /// \param read_version Read at least this version. - int64_t ReadAcquire(int64_t read_version); + // \param[in] read_version The version to read. + // \param[out] version_read For normal immutable objects, this will be set to + // 0. Otherwise, the current version. + // \return success Whether the correct version was read and there were still + // reads remaining. + bool ReadAcquire(int64_t version_to_read, int64_t *version_read); // Finishes the read. If all reads are done, signals to the writer. This is // not necessary to call for objects that have num_readers=-1. diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 1741e72b1315d..f566392368ab0 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -103,6 +103,9 @@ struct ObjectInUseEntry { /// written to after the initial Create and Seal call. Mutable objects are /// used to implement ray.experimental.channel. bool is_mutable = false; + /// Whether we are the writer. For now, only the original creator of the + /// mutable object may write to it. + bool is_writer = false; /// The last version that we read. To read again, we must pass a newer /// version than this. int64_t next_version_to_read = 1; @@ -174,7 +177,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this &object_entry); + Status EnsureGetAcquired(std::unique_ptr &object_entry); Status ExperimentalMutableObjectReadRelease(const ObjectID &object_id); @@ -410,6 +413,14 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( std::shared_ptr *data) { std::unique_lock guard(client_mutex_); auto object_entry = objects_in_use_.find(object_id); + if (object_entry == objects_in_use_.end()) { + return Status::Invalid( + "Plasma buffer for mutable object not in scope. Are you sure you're the writer?"); + } + if (!object_entry->second->is_writer) { + return Status::Invalid( + "Mutable objects can only be written by the original creator process."); + } RAY_CHECK(object_entry != objects_in_use_.end()); auto &entry = object_entry->second; @@ -493,7 +504,9 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, entry->is_mutable = is_mutable; auto plasma_header = GetPlasmaObjectHeader(entry->object); - if (!entry->is_mutable) { + if (entry->is_mutable) { + entry->is_writer = true; + } else { // The first creation's version is always 1. RAY_CHECK(entry->next_version_to_write == 1); // The corresponding WriteRelease takes place in Seal. @@ -571,7 +584,7 @@ Status PlasmaClient::Impl::GetBuffers( all_present = false; } else { // Wait for the object to become ready to read. 
- EnsureGetAcquired(object_entry->second); + RAY_RETURN_NOT_OK(EnsureGetAcquired(object_entry->second)); PlasmaObject *object = &object_entry->second->object; @@ -652,7 +665,7 @@ Status PlasmaClient::Impl::GetBuffers( // Wait for the object to become ready to read. RAY_CHECK(!object_entry->read_acquired); - EnsureGetAcquired(object_entry); + RAY_RETURN_NOT_OK(EnsureGetAcquired(object_entry)); std::shared_ptr physical_buf; RAY_LOG(DEBUG) << "Plasma Get " << received_object_ids[i] << ", data size: " << object_entry->object.data_size @@ -700,15 +713,22 @@ Status PlasmaClient::Impl::Get(const std::vector &object_ids, &object_ids[0], num_objects, timeout_ms, wrap_buffer, &(*out)[0], is_from_worker); } -void PlasmaClient::Impl::EnsureGetAcquired( +Status PlasmaClient::Impl::EnsureGetAcquired( std::unique_ptr &object_entry) { PlasmaObject *object = &object_entry->object; auto plasma_header = GetPlasmaObjectHeader(*object); if (object_entry->read_acquired) { - return; + return Status::OK(); + } + + int64_t version_read = 0; + bool success = + plasma_header->ReadAcquire(object_entry->next_version_to_read, &version_read); + if (!success) { + return Status::Invalid( + "Reader missed a value. Are you sure there are num_readers many readers?"); } - int64_t version_read = plasma_header->ReadAcquire(object_entry->next_version_to_read); object_entry->read_acquired = true; if (version_read > 0) { object_entry->is_mutable = true; @@ -723,6 +743,7 @@ void PlasmaClient::Impl::EnsureGetAcquired( RAY_CHECK(object_entry->object.data_size + object_entry->object.metadata_size <= object_entry->object.allocated_size); } + return Status::OK(); } Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( @@ -745,7 +766,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( "ray.release() called on an object that is not mutable"); } - EnsureGetAcquired(entry); + RAY_RETURN_NOT_OK(EnsureGetAcquired(entry)); RAY_LOG(DEBUG) << "Release shared object " << object_id; auto plasma_header = GetPlasmaObjectHeader(entry->object); plasma_header->ReadRelease(entry->next_version_to_read); From 5bbf37947291473714e92b90099a60dd4a925848 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 1 Dec 2023 16:11:01 -0800 Subject: [PATCH 17/66] remove unneeded Signed-off-by: Stephanie Wang --- BUILD.bazel | 3 --- 1 file changed, 3 deletions(-) diff --git a/BUILD.bazel b/BUILD.bazel index 1f8ff15b53798..42cc658621795 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -402,9 +402,6 @@ ray_cc_library( ":plasma_client", "//src/ray/common:network", ":stats_lib", - "@boost//:asio", - "@boost//:context", - "@boost//:coroutine", ], ) From 1e16e09cdeea87baaf0daacee515d4e58d74bb29 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 1 Dec 2023 17:08:39 -0800 Subject: [PATCH 18/66] java build Signed-off-by: Stephanie Wang --- .../lib/java/io_ray_runtime_object_NativeObjectStore.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.cc b/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.cc index 955b46f746e96..72027ff27af52 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.cc +++ b/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.cc @@ -42,6 +42,7 @@ Status PutSerializedObject(JNIEnv *env, nested_ids.push_back(ObjectID::FromBinary(ref.object_id())); } status = CoreWorkerProcess::GetCoreWorker().CreateOwnedAndIncrementLocalRef( + /*is_experimental_mutable_object=*/false, 
native_ray_object->GetMetadata(), data_size, nested_ids, @@ -128,7 +129,10 @@ JNIEXPORT jobject JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeGet }); std::vector> results; auto status = - CoreWorkerProcess::GetCoreWorker().Get(object_ids, (int64_t)timeoutMs, &results); + CoreWorkerProcess::GetCoreWorker().Get(object_ids, + (int64_t)timeoutMs, + /*is_experimental_mutable_object=*/false, + &results); THROW_EXCEPTION_AND_RETURN_IF_NOT_OK(env, status, nullptr); return NativeVectorToJavaList>( env, results, NativeRayObjectToJavaNativeRayObject); From 580b3ad3d02ac7ca2c7f4efa42e15e7e6842b12c Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 1 Dec 2023 17:14:30 -0800 Subject: [PATCH 19/66] rename Signed-off-by: Stephanie Wang --- python/ray/tests/{test_accelerated_dag.py => test_channel.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python/ray/tests/{test_accelerated_dag.py => test_channel.py} (100%) diff --git a/python/ray/tests/test_accelerated_dag.py b/python/ray/tests/test_channel.py similarity index 100% rename from python/ray/tests/test_accelerated_dag.py rename to python/ray/tests/test_channel.py From bdfbb8afe81d8e0ea9f03f0a18cc229452507fef Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 1 Dec 2023 21:45:49 -0800 Subject: [PATCH 20/66] tmp Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 293 ++++++++++++++++++++++++++++ python/ray/dag/dag_node.py | 48 ++++- 2 files changed, 335 insertions(+), 6 deletions(-) create mode 100644 python/ray/dag/compiled_dag_node.py diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py new file mode 100644 index 0000000000000..88b46e3ef16ba --- /dev/null +++ b/python/ray/dag/compiled_dag_node.py @@ -0,0 +1,293 @@ +import time +import threading +from typing import List +from collections import defaultdict + +import ray +from ray.exceptions import RayTaskError, TaskCancelledError + + +MAX_BUFFER_SIZE = int(100 * 1e6) # 100MB + + +def allocate_shared_output_buffer(buffer_size_bytes: int = MAX_BUFFER_SIZE): + assert isinstance(MAX_BUFFER_SIZE, int) + ref = ray.put(b"0" * buffer_size_bytes, max_readers=1) + # TODO(swang): Sleep to make sure that the object store sees the Seal. Should + # replace this with a better call to put reusable objects, and have the object + # store ReadRelease. 
+ time.sleep(1) + ray.release(ref) + return ref + + +def do_allocate_shared_output_buffer(self, buffer_size_bytes: int = MAX_BUFFER_SIZE): + self._output_ref = allocate_shared_output_buffer(buffer_size_bytes) + return self._output_ref + + +def do_exec_compiled_task( + self, + input_refs: List[ray.ObjectRef], + actor_method_name: str, + output_max_readers: int, +): + try: + self._input_refs = input_refs + method = getattr(self, actor_method_name) + while True: + inputs = ray.get(input_refs) + output_val = method(*inputs) + ray.worker.global_worker.put_object( + output_val, + object_ref=self._output_ref, + max_readers=output_max_readers, + ) + for input_ref in input_refs: + ray.release(input_ref) + except Exception as e: + print("Task aborted", e) + raise + + +def do_cancel_compiled_task(self): + input_refs = self._input_refs + e = RayTaskError( + function_name="do_exec_compiled_task", + traceback_str="", + cause=TaskCancelledError()) + for input_ref in self._input_refs: + print("Putting cancellation token", input_ref) + try: + ray.worker.global_worker.put_object( + e, + object_ref=input_ref, + max_readers=1, + try_wait=True, + ) + except Exception as e: + if "write acquire failed" in str(e): + pass + else: + raise + + +class CompiledTask: + """Wraps the normal Ray DAGNode with some metadata.""" + + def __init__(self, idx, dag_node: "DAGNode"): + self.idx = idx + self.dag_node = dag_node + + self.args = [] + self.dependent_node_idxs = [] + self.output_ref = None + + @property + def max_readers(self): + return len(self.dependent_node_idxs) + + def __str__(self): + return f""" +Node: {self.dag_node} +Arguments: {self.args} +Output: {self.output_ref} +""" + + +class CompiledDAG: + def __init__(self): + # idx -> CompiledTask. + self.idx_to_task = {} + # DAGNode -> idx. + self.dag_node_to_idx = {} + # idx counter. + self.counter = 0 + + self.input_task_idx = None + self.output_task_idx = None + self.node_idx_to_output_refs = {} + + # Cached. + self.dag_input_ref = None + self.dag_input_max_readers = None + self.dag_output_refs = None + self.worker_task_refs = [] + self.actor_refs = set() + + def add_node(self, node): + idx = self.counter + self.idx_to_task[idx] = CompiledTask(idx, node) + self.dag_node_to_idx[node] = idx + self.counter += 1 + + def preprocess(self): + from ray.dag import DAGNode, InputNode, OutputNode + + for idx, task in self.idx_to_task.items(): + task.args = task.dag_node.get_args() + for arg in task.args: + if isinstance(arg, DAGNode): + arg_idx = self.dag_node_to_idx[arg] + self.idx_to_task[arg_idx].dependent_node_idxs.append(idx) + if isinstance(task.dag_node, InputNode): + assert self.input_task_idx is None, "more than one InputNode found" + self.input_task_idx = idx + # TODO: Support no-input DAGs (use an empty object to signal). 
+ assert ( + self.input_task_idx is not None + ), "no InputNode found, require exactly one" + + for idx, task in self.idx_to_task.items(): + if len(task.dependent_node_idxs) == 0: + assert ( + self.output_task_idx is None + ), "More than one output node found, make sure only one node has 0 dependent tasks" + self.output_task_idx = idx + + def compiled(self): + from ray.dag import DAGNode, InputNode, OutputNode, ClassMethodNode + + if self.dag_input_ref is not None and self.dag_output_refs is not None: + # Driver should ray.put on input, ray.get/release on output + return ( + self.dag_input_ref, + self.dag_input_max_readers, + self.dag_output_refs, + self.monitor, + ) + + queue = [self.input_task_idx] + visited = set() + # Create output buffers + while queue: + cur_idx = queue.pop(0) + if cur_idx in visited: + continue + visited.add(cur_idx) + + task = self.idx_to_task[cur_idx] + dependent_node_idxs = task.dependent_node_idxs + + # Create an output buffer on the actor. + assert task.output_ref is None + if isinstance(task.dag_node, ClassMethodNode): + fn = task.dag_node._get_remote_method("__ray_apply__") + task.output_ref = ray.get(fn.remote(do_allocate_shared_output_buffer)) + self.actor_refs.add(task.dag_node._get_actor()) + elif isinstance(task.dag_node, InputNode): + task.output_ref = allocate_shared_output_buffer() + else: + assert isinstance(task.dag_node, OutputNode) + + for idx in task.dependent_node_idxs: + queue.append(idx) + + output_node = self.idx_to_task[self.output_task_idx].dag_node + # TODO: Add an OutputNode to the end of the DAG if + # it's not already there. + assert isinstance(output_node, OutputNode) + + work_refs = [] + for node_idx, task in self.idx_to_task.items(): + if node_idx == self.input_task_idx: + # We don't need to assign an actual task for the input node. + continue + + if node_idx == self.output_task_idx: + # We don't need to assign an actual task for the input node. + continue + + resolved_args = [] + for arg in task.args: + # TODO(swang): Support non-ObjectRef args. + assert isinstance(arg, DAGNode) + arg_idx = self.dag_node_to_idx[arg] + arg_buffer = self.idx_to_task[arg_idx].output_ref + assert arg_buffer is not None + resolved_args.append(arg_buffer) + + # TODO: Assign the task with the correct input and output buffers. 
+ worker_fn = task.dag_node._get_remote_method("__ray_apply__") + self.worker_task_refs.append( + worker_fn.options(concurrency_group="_ray_system").remote( + do_exec_compiled_task, + resolved_args, + task.dag_node.get_method_name(), + task.max_readers, + ) + ) + + self.dag_input_ref = self.idx_to_task[self.input_task_idx].output_ref + self.dag_input_max_readers = self.idx_to_task[self.input_task_idx].max_readers + + self.dag_output_refs = [] + for output in self.idx_to_task[self.output_task_idx].args: + assert isinstance(output, DAGNode) + output_idx = self.dag_node_to_idx[output] + self.dag_output_refs.append(self.idx_to_task[output_idx].output_ref) + + assert self.dag_input_ref + assert self.dag_output_refs + # Driver should ray.put on input, ray.get/release on output + self.monitor = self.monitor_failures() + return (self.dag_input_ref, self.dag_input_max_readers, self.dag_output_refs, self.monitor) + + def monitor_failures(self): + outer = self + + class Monitor(threading.Thread): + def __init__(self): + super().__init__(daemon=True) + self.in_destroy = False + + def destroy(self): + if self.in_destroy: + return + self.in_destroy = True + for actor in outer.actor_refs: + print("Cancelling compiled worker on actor", actor) + try: + ray.get(actor.__ray_apply__.remote(do_cancel_compiled_task)) + except Exception as e: + print("Error cancelling", e) + pass + + def run(self): + try: + ray.get(outer.worker_task_refs) + except Exception as e: + if self.in_destroy: + return + print("Worker task exception", e) + for output_ref in outer.dag_output_refs: + print("Putting error", output_ref) + try: + ray.worker.global_worker.put_object( + e, + object_ref=output_ref, + max_readers=1, + try_wait=True, + ) + except Exception as f: + if "write acquire failed" in str(f): + pass + else: + raise + self.destroy() + + monitor = Monitor() + monitor.start() + return monitor + + +def build_compiled_dag(dag: "DAGNode"): + compiled_dag = CompiledDAG() + + def build_compiled_dag(node): + compiled_dag.add_node(node) + return node + + dag.apply_recursive(build_compiled_dag) + compiled_dag.preprocess() + return compiled_dag diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 6041a12401855..5a1f2251438ac 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -16,6 +16,8 @@ import uuid import asyncio +from ray.dag.compiled_dag_node import build_compiled_dag + T = TypeVar("T") @@ -59,6 +61,8 @@ def __init__( # Cached values from last call to execute() self.cache_from_last_execute = {} + self._compiled_dag = None + def get_args(self) -> Tuple[Any]: """Return the tuple of arguments for this node.""" @@ -103,8 +107,19 @@ async def get_object_refs_from_last_execute(self) -> Dict[str, Any]: def clear_cache(self): self.cache_from_last_execute = {} + def compiled(self) -> Tuple[ray.ObjectRef]: + if self._compiled_dag is None: + self._compiled_dag = build_compiled_dag(self) + + return self._compiled_dag.compiled() + def execute( - self, *args, _ray_cache_refs: bool = False, **kwargs + self, + *args, + _ray_cache_refs: bool = False, + _ray_cache_actors: bool = True, + compiled: bool = False, + **kwargs, ) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: """Execute this DAG using the Ray default executor _execute_impl(). 
@@ -115,15 +130,33 @@ def execute( - Serve handles for class nodes - resolved values representing user input at runtime """ + if compiled: + assert len(args) == 1, "Compiled DAGs support exactly one InputNode arg" + input_ref, input_max_readers, output_ref, _ = self.compiled() + ray.worker.global_worker.put_object( + args[0], object_ref=input_ref, max_readers=input_max_readers + ) + return output_ref def executor(node): return node._execute_impl(*args, **kwargs) - result = self.apply_recursive(executor) + cache = {} if _ray_cache_refs: + cache = self.cache_from_last_execute + elif _ray_cache_actors: + for key, ref in self.cache_from_last_execute.items(): + if isinstance(ref, ray.actor.ActorHandle): + cache[key] = ref + result = self.apply_recursive(executor, cache=cache) + if _ray_cache_refs or _ray_cache_actors: self.cache_from_last_execute = executor.cache return result + def destroy_compiled_dag(self): + _, _, _, monitor = self.compiled() + monitor.destroy() + def _get_toplevel_child_nodes(self) -> List["DAGNode"]: """Return the list of nodes specified as top-level args. @@ -218,7 +251,7 @@ def _apply_and_replace_all_child_nodes( new_args, new_kwargs, self.get_options(), new_other_args_to_resolve ) - def apply_recursive(self, fn: "Callable[[DAGNode], T]") -> T: + def apply_recursive(self, fn: "Callable[[DAGNode], T]", cache=None) -> T: """Apply callable on each node in this DAG in a bottom-up tree walk. Args: @@ -231,8 +264,11 @@ def apply_recursive(self, fn: "Callable[[DAGNode], T]") -> T: """ class _CachingFn: - def __init__(self, fn): - self.cache = {} + def __init__(self, fn, cache=None): + if cache is None: + self.cache = {} + else: + self.cache = cache self.fn = fn self.fn.cache = self.cache self.input_node_uuid = None @@ -250,7 +286,7 @@ def __call__(self, node): return self.cache[node._stable_uuid] if not type(fn).__name__ == "_CachingFn": - fn = _CachingFn(fn) + fn = _CachingFn(fn, cache) return fn( self._apply_and_replace_all_child_nodes( From fe11cc36216fe37327122fb4390bdf5489206a26 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 1 Dec 2023 21:50:00 -0800 Subject: [PATCH 21/66] test metadata change in remote reader Signed-off-by: Stephanie Wang --- python/ray/tests/test_channel.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/python/ray/tests/test_channel.py b/python/ray/tests/test_channel.py index 20e88a101efaf..0bd008593a110 100644 --- a/python/ray/tests/test_channel.py +++ b/python/ray/tests/test_channel.py @@ -112,6 +112,19 @@ def read(self, chan, num_writes): assert chan.begin_read() == val chan.end_read() + for i in range(num_writes): + val = i.to_bytes(100, "little") + assert chan.begin_read() == val + chan.end_read() + + for val in [ + b"hello world", + "hello again", + 1000, + ]: + assert chan.begin_read() == val + chan.end_read() + num_writes = 1000 readers = [Reader.remote() for _ in range(num_readers)] done = [reader.read.remote(chan, num_writes) for reader in readers] @@ -119,6 +132,19 @@ def read(self, chan, num_writes): val = i.to_bytes(8, "little") chan.write(val, num_readers=num_readers) + # Test different data size. + for i in range(num_writes): + val = i.to_bytes(100, "little") + chan.write(val, num_readers=num_readers) + + # Test different metadata. 
+ for val in [ + b"hello world", + "hello again", + 1000, + ]: + chan.write(val, num_readers=num_readers) + ray.get(done) From e11b6146849f68f01b1f9e84121ae5647599d67f Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Mon, 4 Dec 2023 13:55:32 -0800 Subject: [PATCH 22/66] build Signed-off-by: Stephanie Wang --- cpp/src/ray/runtime/object/native_object_store.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/ray/runtime/object/native_object_store.cc b/cpp/src/ray/runtime/object/native_object_store.cc index b65159a9c83fd..6d1a14ae120f9 100644 --- a/cpp/src/ray/runtime/object/native_object_store.cc +++ b/cpp/src/ray/runtime/object/native_object_store.cc @@ -91,7 +91,8 @@ std::vector> NativeObjectStore::GetRaw( const std::vector &ids, int timeout_ms) { auto &core_worker = CoreWorkerProcess::GetCoreWorker(); std::vector> results; - ::ray::Status status = core_worker.Get(ids, timeout_ms, &results); + ::ray::Status status = core_worker.Get( + ids, timeout_ms, /*is_experimental_mutable_object=*/false, &results); if (!status.ok()) { if (status.IsTimedOut()) { throw RayTimeoutException("Get object error:" + status.message()); From b6150a3ebcafed9995b24bd78974842e5badfffe Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Mon, 4 Dec 2023 22:22:33 -0800 Subject: [PATCH 23/66] scatter-gather DAG works Signed-off-by: Stephanie Wang --- python/ray/dag/class_node.py | 4 + python/ray/dag/compiled_dag_node.py | 192 ++++++++-------------------- python/ray/dag/dag_node.py | 8 +- 3 files changed, 63 insertions(+), 141 deletions(-) diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index a474ffa10c553..8daf406dd2e1c 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -193,3 +193,7 @@ def __str__(self) -> str: def get_method_name(self) -> str: return self._method_name + + def _get_remote_method(self, method_name): + method_body = getattr(self._parent_class_node, method_name) + return method_body diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 88b46e3ef16ba..74ce83c91a917 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -1,97 +1,69 @@ -import time -import threading from typing import List -from collections import defaultdict import ray -from ray.exceptions import RayTaskError, TaskCancelledError +import ray.experimental.channel as ray_channel MAX_BUFFER_SIZE = int(100 * 1e6) # 100MB -def allocate_shared_output_buffer(buffer_size_bytes: int = MAX_BUFFER_SIZE): - assert isinstance(MAX_BUFFER_SIZE, int) - ref = ray.put(b"0" * buffer_size_bytes, max_readers=1) - # TODO(swang): Sleep to make sure that the object store sees the Seal. Should - # replace this with a better call to put reusable objects, and have the object - # store ReadRelease. 
- time.sleep(1) - ray.release(ref) - return ref +def allocate_channel(buffer_size_bytes: int = MAX_BUFFER_SIZE, num_readers: int = 1): + if not isinstance(buffer_size_bytes, int): + raise ValueError("buffer_size_bytes must be an integer") + if not isinstance(num_readers, int): + raise ValueError("num_readers must be an integer") + return ray_channel.Channel(buffer_size_bytes, num_readers) -def do_allocate_shared_output_buffer(self, buffer_size_bytes: int = MAX_BUFFER_SIZE): - self._output_ref = allocate_shared_output_buffer(buffer_size_bytes) - return self._output_ref + +def do_allocate_channel( + self, buffer_size_bytes: int = MAX_BUFFER_SIZE, num_readers: int = 1 +): + self._output_channel = allocate_channel(buffer_size_bytes) + return self._output_channel def do_exec_compiled_task( self, - input_refs: List[ray.ObjectRef], + input_channels: List["ray_channel.Channel"], actor_method_name: str, - output_max_readers: int, ): try: - self._input_refs = input_refs + self._input_channels = input_channels method = getattr(self, actor_method_name) while True: - inputs = ray.get(input_refs) + inputs = [chan.begin_read() for chan in input_channels] output_val = method(*inputs) - ray.worker.global_worker.put_object( - output_val, - object_ref=self._output_ref, - max_readers=output_max_readers, - ) - for input_ref in input_refs: - ray.release(input_ref) + + self._output_channel.write(output_val) + for chan in input_channels: + chan.end_read() + except Exception as e: print("Task aborted", e) raise -def do_cancel_compiled_task(self): - input_refs = self._input_refs - e = RayTaskError( - function_name="do_exec_compiled_task", - traceback_str="", - cause=TaskCancelledError()) - for input_ref in self._input_refs: - print("Putting cancellation token", input_ref) - try: - ray.worker.global_worker.put_object( - e, - object_ref=input_ref, - max_readers=1, - try_wait=True, - ) - except Exception as e: - if "write acquire failed" in str(e): - pass - else: - raise - - class CompiledTask: """Wraps the normal Ray DAGNode with some metadata.""" - def __init__(self, idx, dag_node: "DAGNode"): + def __init__(self, idx, dag_node: "ray.dag.DAGNode"): self.idx = idx self.dag_node = dag_node self.args = [] self.dependent_node_idxs = [] - self.output_ref = None + self.output_channel = None @property - def max_readers(self): + def num_readers(self): return len(self.dependent_node_idxs) def __str__(self): return f""" Node: {self.dag_node} Arguments: {self.args} -Output: {self.output_ref} +Output: {self.output_channel} """ @@ -106,14 +78,12 @@ def __init__(self): self.input_task_idx = None self.output_task_idx = None - self.node_idx_to_output_refs = {} + self.node_idx_to_output_channels = {} # Cached. 
self.dag_input_ref = None - self.dag_input_max_readers = None - self.dag_output_refs = None + self.dag_output_channels = None self.worker_task_refs = [] - self.actor_refs = set() def add_node(self, node): idx = self.counter @@ -122,7 +92,7 @@ def add_node(self, node): self.counter += 1 def preprocess(self): - from ray.dag import DAGNode, InputNode, OutputNode + from ray.dag import DAGNode, InputNode for idx, task in self.idx_to_task.items(): task.args = task.dag_node.get_args() @@ -140,21 +110,20 @@ def preprocess(self): for idx, task in self.idx_to_task.items(): if len(task.dependent_node_idxs) == 0: - assert ( - self.output_task_idx is None - ), "More than one output node found, make sure only one node has 0 dependent tasks" + assert self.output_task_idx is None, ( + "More than one output node found, " + "make sure only one node has 0 dependent tasks" + ) self.output_task_idx = idx def compiled(self): from ray.dag import DAGNode, InputNode, OutputNode, ClassMethodNode - if self.dag_input_ref is not None and self.dag_output_refs is not None: + if self.dag_input_ref is not None and self.dag_output_channels is not None: # Driver should ray.put on input, ray.get/release on output return ( self.dag_input_ref, - self.dag_input_max_readers, - self.dag_output_refs, - self.monitor, + self.dag_output_channels, ) queue = [self.input_task_idx] @@ -167,16 +136,18 @@ def compiled(self): visited.add(cur_idx) task = self.idx_to_task[cur_idx] - dependent_node_idxs = task.dependent_node_idxs - # Create an output buffer on the actor. - assert task.output_ref is None + assert task.output_channel is None if isinstance(task.dag_node, ClassMethodNode): - fn = task.dag_node._get_remote_method("__ray_apply__") - task.output_ref = ray.get(fn.remote(do_allocate_shared_output_buffer)) - self.actor_refs.add(task.dag_node._get_actor()) + fn = task.dag_node._get_remote_method("__ray_call__") + task.output_channel = ray.get( + fn.remote( + do_allocate_channel, + num_readers=task.num_readers, + ) + ) elif isinstance(task.dag_node, InputNode): - task.output_ref = allocate_shared_output_buffer() + task.output_channel = allocate_channel(num_readers=task.num_readers) else: assert isinstance(task.dag_node, OutputNode) @@ -188,7 +159,6 @@ def compiled(self): # it's not already there. assert isinstance(output_node, OutputNode) - work_refs = [] for node_idx, task in self.idx_to_task.items(): if node_idx == self.input_task_idx: # We don't need to assign an actual task for the input node. @@ -203,91 +173,41 @@ def compiled(self): # TODO(swang): Support non-ObjectRef args. assert isinstance(arg, DAGNode) arg_idx = self.dag_node_to_idx[arg] - arg_buffer = self.idx_to_task[arg_idx].output_ref + arg_buffer = self.idx_to_task[arg_idx].output_channel assert arg_buffer is not None resolved_args.append(arg_buffer) # TODO: Assign the task with the correct input and output buffers. 
- worker_fn = task.dag_node._get_remote_method("__ray_apply__") + worker_fn = task.dag_node._get_remote_method("__ray_call__") self.worker_task_refs.append( - worker_fn.options(concurrency_group="_ray_system").remote( + worker_fn.remote( do_exec_compiled_task, resolved_args, task.dag_node.get_method_name(), - task.max_readers, ) ) - self.dag_input_ref = self.idx_to_task[self.input_task_idx].output_ref - self.dag_input_max_readers = self.idx_to_task[self.input_task_idx].max_readers + self.dag_input_ref = self.idx_to_task[self.input_task_idx].output_channel - self.dag_output_refs = [] + self.dag_output_channels = [] for output in self.idx_to_task[self.output_task_idx].args: assert isinstance(output, DAGNode) output_idx = self.dag_node_to_idx[output] - self.dag_output_refs.append(self.idx_to_task[output_idx].output_ref) + self.dag_output_channels.append(self.idx_to_task[output_idx].output_channel) assert self.dag_input_ref - assert self.dag_output_refs + assert self.dag_output_channels # Driver should ray.put on input, ray.get/release on output - self.monitor = self.monitor_failures() - return (self.dag_input_ref, self.dag_input_max_readers, self.dag_output_refs, self.monitor) - - def monitor_failures(self): - outer = self - - class Monitor(threading.Thread): - def __init__(self): - super().__init__(daemon=True) - self.in_destroy = False - - def destroy(self): - if self.in_destroy: - return - self.in_destroy = True - for actor in outer.actor_refs: - print("Cancelling compiled worker on actor", actor) - try: - ray.get(actor.__ray_apply__.remote(do_cancel_compiled_task)) - except Exception as e: - print("Error cancelling", e) - pass - - def run(self): - try: - ray.get(outer.worker_task_refs) - except Exception as e: - if self.in_destroy: - return - print("Worker task exception", e) - for output_ref in outer.dag_output_refs: - print("Putting error", output_ref) - try: - ray.worker.global_worker.put_object( - e, - object_ref=output_ref, - max_readers=1, - try_wait=True, - ) - except Exception as f: - if "write acquire failed" in str(f): - pass - else: - raise - self.destroy() - - monitor = Monitor() - monitor.start() - return monitor - - -def build_compiled_dag(dag: "DAGNode"): + return (self.dag_input_ref, self.dag_output_channels) + + +def build_compiled_dag(dag: "ray.dag.DAGNode"): compiled_dag = CompiledDAG() - def build_compiled_dag(node): + def _build_compiled_dag(node): compiled_dag.add_node(node) return node - dag.apply_recursive(build_compiled_dag) + dag.apply_recursive(_build_compiled_dag) compiled_dag.preprocess() return compiled_dag diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 5a1f2251438ac..983db7ed35b5d 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -132,11 +132,9 @@ def execute( """ if compiled: assert len(args) == 1, "Compiled DAGs support exactly one InputNode arg" - input_ref, input_max_readers, output_ref, _ = self.compiled() - ray.worker.global_worker.put_object( - args[0], object_ref=input_ref, max_readers=input_max_readers - ) - return output_ref + input_ref, output_channels = self.compiled() + input_ref.write(args[0]) + return output_channels def executor(node): return node._execute_impl(*args, **kwargs) From 533626268b64e029b505b90365c37d7afe47f826 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 5 Dec 2023 10:40:40 -0800 Subject: [PATCH 24/66] fix Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/client.cc | 12 ++++++------ src/ray/object_manager/plasma/object_store.cc | 1 - 2 files changed, 
6 insertions(+), 7 deletions(-) diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index f566392368ab0..53f3eba9e1583 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -876,12 +876,12 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { object_entry->second->is_sealed = true; auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); - if (plasma_header->num_readers != 0) { - plasma_header->WriteRelease( - /*write_version=*/object_entry->second->next_version_to_write); - // The next Write must pass a higher version. - object_entry->second->next_version_to_write++; - } else { + plasma_header->WriteRelease( + /*write_version=*/object_entry->second->next_version_to_write); + // The next Write must pass a higher version. + object_entry->second->next_version_to_write++; + + if (plasma_header->num_readers <= 0) { // Send the seal request to Plasma. This is the normal Seal path, used for // immutable objects and the initial Create call for mutable objects. RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index 8f4178dc9b797..0615e2b5ad023 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -82,7 +82,6 @@ bool ObjectStore::DeleteObject(const ObjectID &object_id) { if (entry == nullptr) { return false; } - // TODO(swang): Make sure Seal coroutine is done before deleting. auto plasma_header = entry->GetPlasmaObjectHeader(); plasma_header->Destroy(); From 99a38c2068a01e96a4e03469dbcd385ba92e94c9 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 5 Dec 2023 10:40:40 -0800 Subject: [PATCH 25/66] fix Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/client.cc | 12 ++++++------ src/ray/object_manager/plasma/object_store.cc | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index f566392368ab0..53f3eba9e1583 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -876,12 +876,12 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { object_entry->second->is_sealed = true; auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); - if (plasma_header->num_readers != 0) { - plasma_header->WriteRelease( - /*write_version=*/object_entry->second->next_version_to_write); - // The next Write must pass a higher version. - object_entry->second->next_version_to_write++; - } else { + plasma_header->WriteRelease( + /*write_version=*/object_entry->second->next_version_to_write); + // The next Write must pass a higher version. + object_entry->second->next_version_to_write++; + + if (plasma_header->num_readers <= 0) { // Send the seal request to Plasma. This is the normal Seal path, used for // immutable objects and the initial Create call for mutable objects. 
RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index 8f4178dc9b797..0615e2b5ad023 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -82,7 +82,6 @@ bool ObjectStore::DeleteObject(const ObjectID &object_id) { if (entry == nullptr) { return false; } - // TODO(swang): Make sure Seal coroutine is done before deleting. auto plasma_header = entry->GetPlasmaObjectHeader(); plasma_header->Destroy(); From e88c40f37ccc5059a17059d5d628c3e2436725a0 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 5 Dec 2023 16:44:10 -0800 Subject: [PATCH 26/66] fix Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/client.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 53f3eba9e1583..99caa590180f3 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -623,7 +623,6 @@ Status PlasmaClient::Impl::GetBuffers( RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaGetReply, &buffer)); std::vector received_object_ids(num_objects); std::vector object_data(num_objects); - auto object = std::make_unique(); std::vector store_fds; std::vector mmap_sizes; RAY_RETURN_NOT_OK(ReadGetReply(buffer.data(), @@ -644,9 +643,10 @@ Status PlasmaClient::Impl::GetBuffers( GetStoreFdAndMmap(store_fds[i], mmap_sizes[i]); } + std::unique_ptr object; for (int64_t i = 0; i < num_objects; ++i) { RAY_DCHECK(received_object_ids[i] == object_ids[i]); - *object = object_data[i]; + object = std::make_unique(object_data[i]); if (object_buffers[i].data) { // If the object was already in use by the client, then the store should // have returned it. From 95e871b441c25c89249650e287034cab9b3c957a Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 5 Dec 2023 17:00:45 -0800 Subject: [PATCH 27/66] compile? Signed-off-by: Stephanie Wang --- src/ray/object_manager/common.cc | 32 +++++++++++++++---------- src/ray/object_manager/common.h | 17 ++++++++----- src/ray/object_manager/plasma/client.cc | 28 +++++++++++++++------- 3 files changed, 50 insertions(+), 27 deletions(-) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index 35a21ce0e4654..e35a2807942a8 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -2,19 +2,8 @@ namespace ray { -void PrintPlasmaObjectHeader(const PlasmaObjectHeader *header) { - RAY_LOG(DEBUG) << "PlasmaObjectHeader: \n" - << "version: " << header->version << "\n" - << "num_readers: " << header->num_readers << "\n" - << "num_read_acquires_remaining: " << header->num_read_acquires_remaining - << "\n" - << "num_read_releases_remaining: " << header->num_read_releases_remaining - << "\n" - << "data_size: " << header->data_size << "\n" - << "metadata_size: " << header->metadata_size << "\n"; -} - void PlasmaObjectHeader::Init() { +#ifndef _WIN32 // wr_mut is shared between writer and readers. 
pthread_mutexattr_t mutex_attr; pthread_mutexattr_init(&mutex_attr); @@ -29,12 +18,29 @@ void PlasmaObjectHeader::Init() { pthread_condattr_init(&cond_attr); pthread_condattr_setpshared(&cond_attr, PTHREAD_PROCESS_SHARED); pthread_cond_init(&cond, &cond_attr); +#endif } void PlasmaObjectHeader::Destroy() { +#ifndef _WIN32 RAY_CHECK(pthread_mutex_destroy(&wr_mut) == 0); RAY_CHECK(pthread_cond_destroy(&cond) == 0); RAY_CHECK(sem_destroy(&rw_semaphore) == 0); +#endif +} + +#ifndef _WIN32 + +void PrintPlasmaObjectHeader(const PlasmaObjectHeader *header) { + RAY_LOG(DEBUG) << "PlasmaObjectHeader: \n" + << "version: " << header->version << "\n" + << "num_readers: " << header->num_readers << "\n" + << "num_read_acquires_remaining: " << header->num_read_acquires_remaining + << "\n" + << "num_read_releases_remaining: " << header->num_read_releases_remaining + << "\n" + << "data_size: " << header->data_size << "\n" + << "metadata_size: " << header->metadata_size << "\n"; } void PlasmaObjectHeader::WriteAcquire(int64_t write_version, @@ -156,4 +162,6 @@ void PlasmaObjectHeader::ReadRelease(int64_t read_version) { } } +#endif + } // namespace ray diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index eea4ccd8eb7ba..4a4d8404a94d5 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -14,7 +14,9 @@ #pragma once +#ifndef _WIN32 #include +#endif #include #include @@ -46,6 +48,8 @@ using RestoreSpilledObjectCallback = /// needed once the object has been Sealed. For experimental mutable objects, /// we use the header to synchronize between writer and readers. struct PlasmaObjectHeader { +// TODO(swang): PlasmaObjectHeader uses pthreads, POSIX mutex and semaphore. +#ifndef _WIN32 // Used to signal to the writer when all readers are done. sem_t rw_semaphore; @@ -89,12 +93,6 @@ struct PlasmaObjectHeader { uint64_t data_size = 0; uint64_t metadata_size = 0; - /// Setup synchronization primitives. - void Init(); - - /// Destroy synchronization primitives. - void Destroy(); - /// Blocks until all readers for the previous write have ReadRelease'd the /// value. Protects against concurrent writers. Caller must pass consecutive /// versions on each new write, starting with write_version=1. @@ -131,6 +129,13 @@ struct PlasmaObjectHeader { /// \param read_version This must match the version previously passed in /// ReadAcquire. void ReadRelease(int64_t read_version); +#endif + + /// Setup synchronization primitives. + void Init(); + + /// Destroy synchronization primitives. + void Destroy(); }; /// A struct that includes info about the object. 
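The synchronization protocol that common.h documents above can be summarized with a small single-process model. The class below is a hypothetical sketch for illustration only: it mirrors the PlasmaObjectHeader fields and the WriteAcquire/WriteRelease/ReadAcquire/ReadRelease operations, but uses a Python threading.Condition in place of the process-shared pthread mutex, condition variable, and semaphore of the real header, and it elides the data_size/metadata_size bookkeeping:

import threading

class ToyObjectHeader:
    """Toy model of the mutable-object header; not the actual implementation."""

    def __init__(self):
        self._cond = threading.Condition()
        self.version = 0
        self.is_sealed = False
        self.num_readers = 0
        self.num_read_acquires_remaining = 0
        self.num_read_releases_remaining = 0

    def write_acquire(self, write_version, num_readers):
        with self._cond:
            # Wait until every reader of the previous version has released it.
            while self.is_sealed and self.num_read_releases_remaining > 0:
                self._cond.wait()
            assert write_version == self.version + 1, "writes must use consecutive versions"
            self.version = write_version
            self.is_sealed = False
            self.num_readers = num_readers

    def write_release(self, write_version):
        with self._cond:
            assert write_version == self.version
            self.is_sealed = True
            self.num_read_acquires_remaining = self.num_readers
            self.num_read_releases_remaining = self.num_readers
            # Wake any readers blocked in read_acquire.
            self._cond.notify_all()

    def read_acquire(self, version_to_read):
        """Returns (success, version_read), mirroring the bool ReadAcquire above."""
        with self._cond:
            # Wait for the requested version (or a newer one) to be sealed.
            while self.version < version_to_read or not self.is_sealed:
                self._cond.wait()
            if self.num_readers == -1:
                # Normal immutable object: unlimited reads, no release needed.
                return True, 0
            if self.version == version_to_read and self.num_read_acquires_remaining > 0:
                self.num_read_acquires_remaining -= 1
                return True, self.version
            # Either the version was already overwritten or num_readers
            # readers already acquired it.
            return False, self.version

    def read_release(self, read_version):
        with self._cond:
            assert read_version == self.version
            self.num_read_releases_remaining -= 1
            assert self.num_read_releases_remaining >= 0
            # The last release lets the writer's next write_acquire proceed.
            self._cond.notify_all()

In the real header the writer blocks on rw_semaphore, which the final ReadRelease posts; the single condition variable above only approximates that handoff, and it does not work across processes the way the shared-memory header does.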
diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 99caa590180f3..f394c014b7a75 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -233,7 +233,13 @@ class PlasmaClient::Impl : public std::enable_shared_from_this(header_ptr); + } +#endif void InsertObjectInUse(const ObjectID &object_id, std::unique_ptr object, @@ -303,13 +309,6 @@ uint8_t *PlasmaClient::Impl::LookupMmappedFile(MEMFD_TYPE store_fd_val) const { return entry->second->pointer(); } -ray::PlasmaObjectHeader *PlasmaClient::Impl::GetPlasmaObjectHeader( - const PlasmaObject &object) const { - auto base_ptr = LookupMmappedFile(object.store_fd); - auto header_ptr = base_ptr + object.header_offset; - return reinterpret_cast(header_ptr); -} - bool PlasmaClient::Impl::IsInUse(const ObjectID &object_id) { std::lock_guard guard(client_mutex_); @@ -411,6 +410,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( int64_t metadata_size, int64_t num_readers, std::shared_ptr *data) { +#ifndef _WIN32 std::unique_lock guard(client_mutex_); auto object_entry = objects_in_use_.find(object_id); if (object_entry == objects_in_use_.end()) { @@ -455,6 +455,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( } entry->is_sealed = false; +#endif return Status::OK(); } @@ -503,6 +504,7 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, RAY_CHECK(!entry->is_sealed); entry->is_mutable = is_mutable; +#ifndef _WIN32 auto plasma_header = GetPlasmaObjectHeader(entry->object); if (entry->is_mutable) { entry->is_writer = true; @@ -518,6 +520,7 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, // Anyone may read an immutable object. /*num_readers=*/-1); } +#endif } return status; @@ -715,6 +718,7 @@ Status PlasmaClient::Impl::Get(const std::vector &object_ids, Status PlasmaClient::Impl::EnsureGetAcquired( std::unique_ptr &object_entry) { +#ifndef _WIN32 PlasmaObject *object = &object_entry->object; auto plasma_header = GetPlasmaObjectHeader(*object); if (object_entry->read_acquired) { @@ -743,11 +747,13 @@ Status PlasmaClient::Impl::EnsureGetAcquired( RAY_CHECK(object_entry->object.data_size + object_entry->object.metadata_size <= object_entry->object.allocated_size); } +#endif return Status::OK(); } Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( const ObjectID &object_id) { +#ifndef _WIN32 RAY_LOG(DEBUG) << "Try to release Get for object " << object_id; std::unique_lock guard(client_mutex_); @@ -773,7 +779,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( // The next read needs to read at least this version. entry->next_version_to_read++; entry->read_acquired = false; - +#endif return Status::OK(); } @@ -875,6 +881,7 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { } object_entry->second->is_sealed = true; +#ifndef _WIN32 auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); plasma_header->WriteRelease( /*write_version=*/object_entry->second->next_version_to_write); @@ -882,6 +889,9 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { object_entry->second->next_version_to_write++; if (plasma_header->num_readers <= 0) { +#else + { +#endif // Send the seal request to Plasma. This is the normal Seal path, used for // immutable objects and the initial Create call for mutable objects. 
RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); From 204bb9bdc3e5514d821acfc129a32357a8963a5c Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 5 Dec 2023 16:44:10 -0800 Subject: [PATCH 28/66] fix Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/client.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 53f3eba9e1583..99caa590180f3 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -623,7 +623,6 @@ Status PlasmaClient::Impl::GetBuffers( RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaGetReply, &buffer)); std::vector received_object_ids(num_objects); std::vector object_data(num_objects); - auto object = std::make_unique(); std::vector store_fds; std::vector mmap_sizes; RAY_RETURN_NOT_OK(ReadGetReply(buffer.data(), @@ -644,9 +643,10 @@ Status PlasmaClient::Impl::GetBuffers( GetStoreFdAndMmap(store_fds[i], mmap_sizes[i]); } + std::unique_ptr object; for (int64_t i = 0; i < num_objects; ++i) { RAY_DCHECK(received_object_ids[i] == object_ids[i]); - *object = object_data[i]; + object = std::make_unique(object_data[i]); if (object_buffers[i].data) { // If the object was already in use by the client, then the store should // have returned it. From 4703f34eed5909b8c088636f444d992ef9a3c1e7 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 5 Dec 2023 17:00:45 -0800 Subject: [PATCH 29/66] compile? Signed-off-by: Stephanie Wang --- src/ray/object_manager/common.cc | 32 +++++++++++++++---------- src/ray/object_manager/common.h | 17 ++++++++----- src/ray/object_manager/plasma/client.cc | 28 +++++++++++++++------- 3 files changed, 50 insertions(+), 27 deletions(-) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index 35a21ce0e4654..e35a2807942a8 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -2,19 +2,8 @@ namespace ray { -void PrintPlasmaObjectHeader(const PlasmaObjectHeader *header) { - RAY_LOG(DEBUG) << "PlasmaObjectHeader: \n" - << "version: " << header->version << "\n" - << "num_readers: " << header->num_readers << "\n" - << "num_read_acquires_remaining: " << header->num_read_acquires_remaining - << "\n" - << "num_read_releases_remaining: " << header->num_read_releases_remaining - << "\n" - << "data_size: " << header->data_size << "\n" - << "metadata_size: " << header->metadata_size << "\n"; -} - void PlasmaObjectHeader::Init() { +#ifndef _WIN32 // wr_mut is shared between writer and readers. 
pthread_mutexattr_t mutex_attr; pthread_mutexattr_init(&mutex_attr); @@ -29,12 +18,29 @@ void PlasmaObjectHeader::Init() { pthread_condattr_init(&cond_attr); pthread_condattr_setpshared(&cond_attr, PTHREAD_PROCESS_SHARED); pthread_cond_init(&cond, &cond_attr); +#endif } void PlasmaObjectHeader::Destroy() { +#ifndef _WIN32 RAY_CHECK(pthread_mutex_destroy(&wr_mut) == 0); RAY_CHECK(pthread_cond_destroy(&cond) == 0); RAY_CHECK(sem_destroy(&rw_semaphore) == 0); +#endif +} + +#ifndef _WIN32 + +void PrintPlasmaObjectHeader(const PlasmaObjectHeader *header) { + RAY_LOG(DEBUG) << "PlasmaObjectHeader: \n" + << "version: " << header->version << "\n" + << "num_readers: " << header->num_readers << "\n" + << "num_read_acquires_remaining: " << header->num_read_acquires_remaining + << "\n" + << "num_read_releases_remaining: " << header->num_read_releases_remaining + << "\n" + << "data_size: " << header->data_size << "\n" + << "metadata_size: " << header->metadata_size << "\n"; } void PlasmaObjectHeader::WriteAcquire(int64_t write_version, @@ -156,4 +162,6 @@ void PlasmaObjectHeader::ReadRelease(int64_t read_version) { } } +#endif + } // namespace ray diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index eea4ccd8eb7ba..4a4d8404a94d5 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -14,7 +14,9 @@ #pragma once +#ifndef _WIN32 #include +#endif #include #include @@ -46,6 +48,8 @@ using RestoreSpilledObjectCallback = /// needed once the object has been Sealed. For experimental mutable objects, /// we use the header to synchronize between writer and readers. struct PlasmaObjectHeader { +// TODO(swang): PlasmaObjectHeader uses pthreads, POSIX mutex and semaphore. +#ifndef _WIN32 // Used to signal to the writer when all readers are done. sem_t rw_semaphore; @@ -89,12 +93,6 @@ struct PlasmaObjectHeader { uint64_t data_size = 0; uint64_t metadata_size = 0; - /// Setup synchronization primitives. - void Init(); - - /// Destroy synchronization primitives. - void Destroy(); - /// Blocks until all readers for the previous write have ReadRelease'd the /// value. Protects against concurrent writers. Caller must pass consecutive /// versions on each new write, starting with write_version=1. @@ -131,6 +129,13 @@ struct PlasmaObjectHeader { /// \param read_version This must match the version previously passed in /// ReadAcquire. void ReadRelease(int64_t read_version); +#endif + + /// Setup synchronization primitives. + void Init(); + + /// Destroy synchronization primitives. + void Destroy(); }; /// A struct that includes info about the object. 
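The #ifndef _WIN32 guards introduced above exist because PlasmaObjectHeader lives in shared memory and relies on POSIX process-shared primitives (a pthread mutex and condition variable marked PTHREAD_PROCESS_SHARED, plus an unnamed semaphore), for which the patch's own TODO notes there is no direct Windows counterpart. As a rough illustration of what Init()/Destroy() set up, the following minimal, self-contained C++ sketch initializes process-shared synchronization inside an anonymous shared mapping. It is not part of any patch in this series; the struct name SharedSync, its field names, and the main() driver are purely illustrative.

// Sketch only: process-shared pthread/semaphore setup in shared memory,
// similar in spirit to PlasmaObjectHeader::Init(). Not taken from the patch.
#include <pthread.h>
#include <semaphore.h>
#include <sys/mman.h>
#include <cstdio>
#include <new>

struct SharedSync {
  sem_t rw_semaphore;     // signals the writer when readers are done
  pthread_mutex_t mut;    // protects the fields below
  pthread_cond_t cond;    // wakes readers waiting for a new version
  long version = 0;       // monotonically increasing write version
};

int main() {
  // Place the struct in memory that forked child processes can also see.
  void *mem = mmap(nullptr, sizeof(SharedSync), PROT_READ | PROT_WRITE,
                   MAP_SHARED | MAP_ANONYMOUS, -1, 0);
  if (mem == MAP_FAILED) {
    perror("mmap");
    return 1;
  }
  auto *sync = new (mem) SharedSync{};

  // Unnamed semaphore shared between processes (pshared=1), initial value 1.
  sem_init(&sync->rw_semaphore, /*pshared=*/1, 1);

  // The mutex and condition variable must be created with the
  // PTHREAD_PROCESS_SHARED attribute; otherwise waking a waiter that lives
  // in another process is undefined behavior.
  pthread_mutexattr_t mattr;
  pthread_mutexattr_init(&mattr);
  pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
  pthread_mutex_init(&sync->mut, &mattr);

  pthread_condattr_t cattr;
  pthread_condattr_init(&cattr);
  pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);
  pthread_cond_init(&sync->cond, &cattr);

  // ... fork() here and let a writer bump sync->version under the mutex,
  // with readers blocking on the condition variable until it advances ...

  pthread_cond_destroy(&sync->cond);
  pthread_mutex_destroy(&sync->mut);
  sem_destroy(&sync->rw_semaphore);
  munmap(mem, sizeof(SharedSync));
  return 0;
}

A writer and its readers coordinate entirely through fields inside the mapping, which is the same pattern the versioned WriteAcquire/WriteRelease and ReadAcquire/ReadRelease methods follow; since none of these POSIX calls are available on Windows, the patch compiles the members and methods out there rather than emulating them.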
diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 99caa590180f3..f394c014b7a75 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -233,7 +233,13 @@ class PlasmaClient::Impl : public std::enable_shared_from_this(header_ptr); + } +#endif void InsertObjectInUse(const ObjectID &object_id, std::unique_ptr object, @@ -303,13 +309,6 @@ uint8_t *PlasmaClient::Impl::LookupMmappedFile(MEMFD_TYPE store_fd_val) const { return entry->second->pointer(); } -ray::PlasmaObjectHeader *PlasmaClient::Impl::GetPlasmaObjectHeader( - const PlasmaObject &object) const { - auto base_ptr = LookupMmappedFile(object.store_fd); - auto header_ptr = base_ptr + object.header_offset; - return reinterpret_cast(header_ptr); -} - bool PlasmaClient::Impl::IsInUse(const ObjectID &object_id) { std::lock_guard guard(client_mutex_); @@ -411,6 +410,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( int64_t metadata_size, int64_t num_readers, std::shared_ptr *data) { +#ifndef _WIN32 std::unique_lock guard(client_mutex_); auto object_entry = objects_in_use_.find(object_id); if (object_entry == objects_in_use_.end()) { @@ -455,6 +455,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( } entry->is_sealed = false; +#endif return Status::OK(); } @@ -503,6 +504,7 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, RAY_CHECK(!entry->is_sealed); entry->is_mutable = is_mutable; +#ifndef _WIN32 auto plasma_header = GetPlasmaObjectHeader(entry->object); if (entry->is_mutable) { entry->is_writer = true; @@ -518,6 +520,7 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, // Anyone may read an immutable object. /*num_readers=*/-1); } +#endif } return status; @@ -715,6 +718,7 @@ Status PlasmaClient::Impl::Get(const std::vector &object_ids, Status PlasmaClient::Impl::EnsureGetAcquired( std::unique_ptr &object_entry) { +#ifndef _WIN32 PlasmaObject *object = &object_entry->object; auto plasma_header = GetPlasmaObjectHeader(*object); if (object_entry->read_acquired) { @@ -743,11 +747,13 @@ Status PlasmaClient::Impl::EnsureGetAcquired( RAY_CHECK(object_entry->object.data_size + object_entry->object.metadata_size <= object_entry->object.allocated_size); } +#endif return Status::OK(); } Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( const ObjectID &object_id) { +#ifndef _WIN32 RAY_LOG(DEBUG) << "Try to release Get for object " << object_id; std::unique_lock guard(client_mutex_); @@ -773,7 +779,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( // The next read needs to read at least this version. entry->next_version_to_read++; entry->read_acquired = false; - +#endif return Status::OK(); } @@ -875,6 +881,7 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { } object_entry->second->is_sealed = true; +#ifndef _WIN32 auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); plasma_header->WriteRelease( /*write_version=*/object_entry->second->next_version_to_write); @@ -882,6 +889,9 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { object_entry->second->next_version_to_write++; if (plasma_header->num_readers <= 0) { +#else + { +#endif // Send the seal request to Plasma. This is the normal Seal path, used for // immutable objects and the initial Create call for mutable objects. 
RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); From 420bd1c060fadb676e228890900890db29a1c538 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 5 Dec 2023 21:20:50 -0800 Subject: [PATCH 30/66] build Signed-off-by: Stephanie Wang --- python/ray/tests/BUILD | 1 + src/ray/object_manager/plasma/object_store.cc | 2 ++ 2 files changed, 3 insertions(+) diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 6560b38950af6..3a48b4edd888a 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -249,6 +249,7 @@ py_test_module_list( "test_annotations.py", "test_args.py", "test_asyncio_cluster.py", + "test_channel.py", "test_concurrency_group.py", "test_component_failures.py", "test_cross_language.py", diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index 0615e2b5ad023..a9324ad39d90f 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -71,9 +71,11 @@ const LocalObject *ObjectStore::SealObject(const ObjectID &object_id) { entry->state = ObjectState::PLASMA_SEALED; entry->construct_duration = std::time(nullptr) - entry->create_time; auto plasma_header = entry->GetPlasmaObjectHeader(); +#ifndef _WIN32 if (!entry->object_info.is_mutable) { RAY_CHECK(plasma_header->num_readers == -1) << plasma_header->num_readers; } +#endif return entry; } From 4cabbc59f684d8bfaab97616b61229b228dc6cdb Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 08:54:16 -0800 Subject: [PATCH 31/66] x Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/client.cc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index f394c014b7a75..13b90b44df050 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -661,14 +661,19 @@ Status PlasmaClient::Impl::GetBuffers( // If we are here, the object was not currently in use, so we need to // process the reply from the object store. if (object->data_size != -1) { - // Increment the count of the number of instances of this object that this - // client is using. Cache the reference to the object. - InsertObjectInUse(received_object_ids[i], std::move(object), /*is_sealed=*/true); + if (objects_in_use_.find(received_object_ids[i]) == objects_in_use_.end()) { + // Increment the count of the number of instances of this object that this + // client is using. Cache the reference to the object. + InsertObjectInUse(received_object_ids[i], std::move(object), /*is_sealed=*/true); + } else { + IncrementObjectCount(received_object_ids[i]); + } auto &object_entry = objects_in_use_[received_object_ids[i]]; // Wait for the object to become ready to read. 
- RAY_CHECK(!object_entry->read_acquired); - RAY_RETURN_NOT_OK(EnsureGetAcquired(object_entry)); + if (!object_entry->read_acquired) { + RAY_RETURN_NOT_OK(EnsureGetAcquired(object_entry)); + } std::shared_ptr physical_buf; RAY_LOG(DEBUG) << "Plasma Get " << received_object_ids[i] << ", data size: " << object_entry->object.data_size From b44ef8ae5d7edb40e1b233abdf145e476c088d3a Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 09:24:49 -0800 Subject: [PATCH 32/66] fix Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/client.cc | 72 ++++++++++++++----------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index 13b90b44df050..d06f073c91a01 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -146,6 +146,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data); @@ -204,6 +205,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data); @@ -341,6 +343,7 @@ void PlasmaClient::Impl::IncrementObjectCount(const ObjectID &object_id) { } Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, + bool is_mutable, const uint8_t *metadata, uint64_t *retry_with_request_id, std::shared_ptr *data) { @@ -400,6 +403,32 @@ Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, // buffer returned by PlasmaClient::Create goes out of scope, the object does // not get released before the call to PlasmaClient::Seal happens. IncrementObjectCount(object_id); + + // Create IPC was successful. + auto object_entry = objects_in_use_.find(object_id); + RAY_CHECK(object_entry != objects_in_use_.end()); + auto &entry = object_entry->second; + RAY_CHECK(!entry->is_sealed); + entry->is_mutable = is_mutable; + +#ifndef _WIN32 + auto plasma_header = GetPlasmaObjectHeader(entry->object); + if (entry->is_mutable) { + entry->is_writer = true; + } else { + // The first creation's version is always 1. + RAY_CHECK(entry->next_version_to_write == 1); + // The corresponding WriteRelease takes place in Seal. + // When an object is first created, the data size is equivalent to + // buffer size. + plasma_header->WriteAcquire(entry->next_version_to_write, + entry->object.data_size, + entry->object.metadata_size, + // Anyone may read an immutable object. + /*num_readers=*/-1); + } +#endif + return Status::OK(); } @@ -482,7 +511,8 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, source, device_num, /*try_immediately=*/false)); - Status status = HandleCreateReply(object_id, metadata, &retry_with_request_id, data); + Status status = + HandleCreateReply(object_id, is_mutable, metadata, &retry_with_request_id, data); while (retry_with_request_id > 0) { guard.unlock(); @@ -492,35 +522,12 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, guard.lock(); RAY_LOG(DEBUG) << "Retrying request for object " << object_id << " with request ID " << retry_with_request_id; - status = RetryCreate( - object_id, retry_with_request_id, metadata, &retry_with_request_id, data); - } - - if (status.ok()) { - // Create IPC was successful. 
- auto object_entry = objects_in_use_.find(object_id); - RAY_CHECK(object_entry != objects_in_use_.end()); - auto &entry = object_entry->second; - RAY_CHECK(!entry->is_sealed); - entry->is_mutable = is_mutable; - -#ifndef _WIN32 - auto plasma_header = GetPlasmaObjectHeader(entry->object); - if (entry->is_mutable) { - entry->is_writer = true; - } else { - // The first creation's version is always 1. - RAY_CHECK(entry->next_version_to_write == 1); - // The corresponding WriteRelease takes place in Seal. - // When an object is first created, the data size is equivalent to - // buffer size. - plasma_header->WriteAcquire(entry->next_version_to_write, - data_size, - metadata_size, - // Anyone may read an immutable object. - /*num_readers=*/-1); - } -#endif + status = RetryCreate(object_id, + retry_with_request_id, + is_mutable, + metadata, + &retry_with_request_id, + data); } return status; @@ -528,12 +535,13 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, Status PlasmaClient::Impl::RetryCreate(const ObjectID &object_id, uint64_t request_id, + bool is_mutable, const uint8_t *metadata, uint64_t *retry_with_request_id, std::shared_ptr *data) { std::lock_guard guard(client_mutex_); RAY_RETURN_NOT_OK(SendCreateRetryRequest(store_conn_, object_id, request_id)); - return HandleCreateReply(object_id, metadata, retry_with_request_id, data); + return HandleCreateReply(object_id, is_mutable, metadata, retry_with_request_id, data); } Status PlasmaClient::Impl::TryCreateImmediately(const ObjectID &object_id, @@ -557,7 +565,7 @@ Status PlasmaClient::Impl::TryCreateImmediately(const ObjectID &object_id, source, device_num, /*try_immediately=*/true)); - return HandleCreateReply(object_id, metadata, nullptr, data); + return HandleCreateReply(object_id, /*is_mutable=*/false, metadata, nullptr, data); } Status PlasmaClient::Impl::GetBuffers( From e54972b99872450c902a3d97d9889a2f290f1457 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 09:40:20 -0800 Subject: [PATCH 33/66] unit test Signed-off-by: Stephanie Wang --- python/ray/dag/BUILD | 8 ++++ python/ray/dag/tests/test_accelerated_dag.py | 50 ++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 python/ray/dag/tests/test_accelerated_dag.py diff --git a/python/ray/dag/BUILD b/python/ray/dag/BUILD index 53e61563231ff..f5eea4e6155b5 100644 --- a/python/ray/dag/BUILD +++ b/python/ray/dag/BUILD @@ -60,3 +60,11 @@ py_test( tags = ["exclusive", "team:core", "ray_dag_tests"], deps = [":dag_lib"], ) + +py_test( + name = "test_accelerated_dag", + size = "small", + srcs = dag_tests_srcs, + tags = ["exclusive", "team:core", "ray_dag_tests"], + deps = [":dag_lib"], +) diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py new file mode 100644 index 0000000000000..b07491c1f18c6 --- /dev/null +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -0,0 +1,50 @@ +# coding: utf-8 +import logging +import os +import sys + +import numpy as np +import pytest + +import ray +import ray.cluster_utils +import ray.experimental.channel as ray_channel +from ray.dag import DAGNode, InputNode, OutputNode + + +logger = logging.getLogger(__name__) + + +@ray.remote(concurrency_groups={"_ray_system": 1}) +class Actor: + def __init__(self, init_value): + print("__init__ PID", os.getpid()) + self.i = init_value + + def inc(self, x): + self.i += x + return self.i + + +@pytest.mark.parametrize("num_actors", [1, 4]) +def test_scatter_gather_dag(ray_start_regular, num_actors): + 
init_val = 0 + actors = [Actor.remote(init_val) for _ in range(num_actors)] + with InputNode() as i: + out = [a.inc.bind(i) for a in actors] + dag = OutputNode(out) + + for i in range(3): + output_channels = dag.execute(1, compiled=True) + # TODO(swang): Replace with fake ObjectRef. + results = [chan.begin_read() for chan in output_channels] + assert results == [init_val + i + 1] * num_actors + for chan in output_channels: + chan.end_read() + + +if __name__ == "__main__": + if os.environ.get("PARALLEL_CI"): + sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) + else: + sys.exit(pytest.main(["-sv", __file__])) From 881d5ff6aa807c40a381b4833c41d060914cbb86 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 14:08:18 -0800 Subject: [PATCH 34/66] copyright Signed-off-by: Stephanie Wang --- src/ray/object_manager/common.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/ray/object_manager/common.cc b/src/ray/object_manager/common.cc index e35a2807942a8..cb3335f9cf8c9 100644 --- a/src/ray/object_manager/common.cc +++ b/src/ray/object_manager/common.cc @@ -1,3 +1,17 @@ +// Copyright 2020-2021 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "ray/object_manager/common.h" namespace ray { From ef2cfb7bf5376aed1801a30a75c97629cbda82ac Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 14:18:54 -0800 Subject: [PATCH 35/66] test Signed-off-by: Stephanie Wang --- python/ray/tests/test_object_store_metrics.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/ray/tests/test_object_store_metrics.py b/python/ray/tests/test_object_store_metrics.py index 60922a888dc47..7e72919e761b4 100644 --- a/python/ray/tests/test_object_store_metrics.py +++ b/python/ray/tests/test_object_store_metrics.py @@ -92,7 +92,7 @@ def test_shared_memory_and_inline_worker_heap(shutdown_only): wait_for_condition( # 1KiB for metadata difference - lambda: approx_eq_dict_in(objects_by_loc(info), expected, 1 * KiB), + lambda: approx_eq_dict_in(objects_by_loc(info), expected, 2 * KiB), timeout=20, retry_interval_ms=500, ) @@ -134,7 +134,7 @@ def func(): wait_for_condition( # 1KiB for metadata difference - lambda: approx_eq_dict_in(objects_by_loc(info), expected, 1 * KiB), + lambda: approx_eq_dict_in(objects_by_loc(info), expected, 2 * KiB), timeout=20, retry_interval_ms=500, ) @@ -255,7 +255,7 @@ def test_fallback_memory(shutdown_only): wait_for_condition( # 2KiB for metadata difference - lambda: approx_eq_dict_in(objects_by_loc(info), expected, 2 * KiB), + lambda: approx_eq_dict_in(objects_by_loc(info), expected, 3 * KiB), timeout=20, retry_interval_ms=500, ) @@ -282,8 +282,8 @@ def test_fallback_memory(shutdown_only): } wait_for_condition( - # 1KiB for metadata difference - lambda: approx_eq_dict_in(objects_by_loc(info), expected, 2 * KiB), + # 3KiB for metadata difference + lambda: approx_eq_dict_in(objects_by_loc(info), expected, 3 * KiB), timeout=20, retry_interval_ms=500, ) @@ -302,8 
+302,8 @@ def test_fallback_memory(shutdown_only): } wait_for_condition( - # 1KiB for metadata difference - lambda: approx_eq_dict_in(objects_by_loc(info), expected, 2 * KiB), + # 3KiB for metadata difference + lambda: approx_eq_dict_in(objects_by_loc(info), expected, 3 * KiB), timeout=20, retry_interval_ms=500, ) @@ -333,7 +333,7 @@ def test_seal_memory(shutdown_only): wait_for_condition( # 1KiB for metadata difference - lambda: approx_eq_dict_in(objects_by_seal_state(info), expected, 1 * KiB), + lambda: approx_eq_dict_in(objects_by_seal_state(info), expected, 2 * KiB), timeout=20, retry_interval_ms=500, ) @@ -347,7 +347,7 @@ def test_seal_memory(shutdown_only): wait_for_condition( # 1KiB for metadata difference - lambda: approx_eq_dict_in(objects_by_seal_state(info), expected, 1 * KiB), + lambda: approx_eq_dict_in(objects_by_seal_state(info), expected, 2 * KiB), timeout=20, retry_interval_ms=500, ) From 93968107d6de00336ca5af23a3c95e9f655ddc89 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 14:55:32 -0800 Subject: [PATCH 36/66] tmp Signed-off-by: Stephanie Wang --- python/ray/_raylet.pyx | 7 +++++++ python/ray/experimental/channel.py | 4 ++++ src/ray/core_worker/core_worker.cc | 4 ++++ 3 files changed, 15 insertions(+) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 6495d46a57f7c..5044f09ab388e 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -3508,6 +3508,13 @@ cdef class CoreWorker: generator_id=CObjectID.Nil(), owner_address=null_owner_address)) + def experimental_mutable_object_get_current_output_ref(self, ObjectRef channel_ref): + cdef: + CObjectID c_channel_ref = channel_ref + with nogil: + return (CCoreWorkerProcess.GetCoreWorker() + .ExperimentalMutableObjectGetCurrentOutputRef(c_channel_ref)) + def experimental_mutable_object_read_release(self, object_refs): """ For experimental.channel.Channel. diff --git a/python/ray/experimental/channel.py b/python/ray/experimental/channel.py index e8ef9ad085f79..11f5e36053822 100644 --- a/python/ray/experimental/channel.py +++ b/python/ray/experimental/channel.py @@ -120,6 +120,10 @@ def write(self, value: Any, num_readers: Optional[int] = None): num_readers, ) + def get_current_output_ref(self) -> "ray.ObjectRef": + return self._worker.core_worker.experimental_mutable_object_get_current_output_ref( + self._base_ref) + def begin_read(self) -> Any: """ Read the latest value from the channel. 
This call will block until a diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 57000d2b3abbc..cd0d5a38e5607 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1307,6 +1307,10 @@ Status CoreWorker::CreateOwnedAndIncrementLocalRef( memory_store_->Put(RayObject(rpc::ErrorType::OBJECT_IN_PLASMA), *object_id)); } } + + if (is_experimental_mutable_object) { + RegisterChannel(*object_id); + } return Status::OK(); } From dbbb3d6b89927eb8f1fbcc78c0b60317f0b58d89 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 16:39:31 -0800 Subject: [PATCH 37/66] Only allocate PlasmaObjectHeader if is_mutable=true Signed-off-by: Stephanie Wang --- src/ray/object_manager/common.h | 4 ++-- src/ray/object_manager/plasma/common.h | 10 +++++++--- src/ray/object_manager/plasma/object_store.cc | 20 +++++++++---------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 4a4d8404a94d5..e3e8381de9330 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -141,7 +141,7 @@ struct PlasmaObjectHeader { /// A struct that includes info about the object. struct ObjectInfo { ObjectID object_id; - bool is_mutable; + bool is_mutable = false; int64_t data_size = 0; int64_t metadata_size = 0; /// Owner's raylet ID. @@ -154,7 +154,7 @@ struct ObjectInfo { WorkerID owner_worker_id; int64_t GetObjectSize() const { - return sizeof(PlasmaObjectHeader) + data_size + metadata_size; + return data_size + metadata_size + (is_mutable ? sizeof(PlasmaObjectHeader) : 0); } bool operator==(const ObjectInfo &other) const { diff --git a/src/ray/object_manager/plasma/common.h b/src/ray/object_manager/plasma/common.h index d24a110c32e44..414af54b4a544 100644 --- a/src/ray/object_manager/plasma/common.h +++ b/src/ray/object_manager/plasma/common.h @@ -124,6 +124,7 @@ class LocalObject { const plasma::flatbuf::ObjectSource &GetSource() const { return source; } ray::PlasmaObjectHeader *GetPlasmaObjectHeader() const { + RAY_CHECK(object_info.is_mutable) << "Object is not mutable"; auto header_ptr = static_cast(allocation.address); return reinterpret_cast(header_ptr); } @@ -135,9 +136,12 @@ class LocalObject { } object->store_fd = GetAllocation().fd; object->header_offset = GetAllocation().offset; - object->data_offset = GetAllocation().offset + sizeof(ray::PlasmaObjectHeader); - object->metadata_offset = GetAllocation().offset + sizeof(ray::PlasmaObjectHeader) + - GetObjectInfo().data_size; + object->data_offset = GetAllocation().offset; + object->metadata_offset = GetAllocation().offset + GetObjectInfo().data_size; + if (object_info.is_mutable) { + object->data_offset += sizeof(ray::PlasmaObjectHeader); + object->metadata_offset += sizeof(ray::PlasmaObjectHeader); + }; object->data_size = GetObjectInfo().data_size; object->metadata_size = GetObjectInfo().metadata_size; // Senders and receivers of a channel may store different data and metadata diff --git a/src/ray/object_manager/plasma/object_store.cc b/src/ray/object_manager/plasma/object_store.cc index a9324ad39d90f..4262a282f3fa9 100644 --- a/src/ray/object_manager/plasma/object_store.cc +++ b/src/ray/object_manager/plasma/object_store.cc @@ -47,9 +47,11 @@ const LocalObject *ObjectStore::CreateObject(const ray::ObjectInfo &object_info, entry->construct_duration = -1; entry->source = source; - auto plasma_header = entry->GetPlasmaObjectHeader(); - *plasma_header = ray::PlasmaObjectHeader{}; 
- plasma_header->Init(); + if (object_info.is_mutable) { + auto plasma_header = entry->GetPlasmaObjectHeader(); + *plasma_header = ray::PlasmaObjectHeader{}; + plasma_header->Init(); + } RAY_LOG(DEBUG) << "create object " << object_info.object_id << " succeeded"; return entry; @@ -70,12 +72,6 @@ const LocalObject *ObjectStore::SealObject(const ObjectID &object_id) { } entry->state = ObjectState::PLASMA_SEALED; entry->construct_duration = std::time(nullptr) - entry->create_time; - auto plasma_header = entry->GetPlasmaObjectHeader(); -#ifndef _WIN32 - if (!entry->object_info.is_mutable) { - RAY_CHECK(plasma_header->num_readers == -1) << plasma_header->num_readers; - } -#endif return entry; } @@ -84,8 +80,10 @@ bool ObjectStore::DeleteObject(const ObjectID &object_id) { if (entry == nullptr) { return false; } - auto plasma_header = entry->GetPlasmaObjectHeader(); - plasma_header->Destroy(); + if (entry->object_info.is_mutable) { + auto plasma_header = entry->GetPlasmaObjectHeader(); + plasma_header->Destroy(); + } allocator_.Free(std::move(entry->allocation)); object_table_.erase(object_id); From 9078776a704335e4646e48c565f6be4a0eb8382c Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 18:34:35 -0800 Subject: [PATCH 38/66] Only call Read/Write Acquire/Release if is_mutable=true Signed-off-by: Stephanie Wang --- python/ray/_raylet.pyx | 9 +- python/ray/includes/libcoreworker.pxd | 2 + src/ray/core_worker/core_worker.cc | 4 + src/ray/core_worker/core_worker.h | 7 + .../store_provider/plasma_store_provider.cc | 5 + .../store_provider/plasma_store_provider.h | 7 + src/ray/object_manager/plasma/client.cc | 174 +++++++++--------- src/ray/object_manager/plasma/client.h | 9 + src/ray/object_manager/plasma/common.h | 1 + src/ray/object_manager/plasma/plasma.fbs | 3 + src/ray/object_manager/plasma/plasma.h | 1 + src/ray/object_manager/plasma/protocol.cc | 10 +- 12 files changed, 137 insertions(+), 95 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 6495d46a57f7c..595986851ed7c 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -3502,11 +3502,10 @@ cdef class CoreWorker: if data_size > 0: (serialized_object).write_to( Buffer.make(data)) - check_status( - CCoreWorkerProcess.GetCoreWorker().SealExisting( - c_object_id, pin_object=False, - generator_id=CObjectID.Nil(), - owner_address=null_owner_address)) + check_status(CCoreWorkerProcess.GetCoreWorker() + .ExperimentalMutableObjectWriteRelease( + c_object_id, + )) def experimental_mutable_object_read_release(self, object_refs): """ diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 00bf9b5f9d4e6..ab2fb6c1f8f5a 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -246,6 +246,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: uint64_t data_size, int64_t num_readers, shared_ptr[CBuffer] *data) + CRayStatus ExperimentalMutableObjectWriteRelease( + const CObjectID &object_id) CRayStatus SealOwned(const CObjectID &object_id, c_bool pin_object, const unique_ptr[CAddress] &owner_address) CRayStatus SealExisting(const CObjectID &object_id, c_bool pin_object, diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 57000d2b3abbc..ce1ebe57de8e5 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1336,6 +1336,10 @@ Status CoreWorker::ExperimentalMutableObjectWriteAcquire( object_id, metadata, data_size, num_readers, data); } 
+Status CoreWorker::ExperimentalMutableObjectWriteRelease(const ObjectID &object_id) { + return plasma_store_provider_->ExperimentalMutableObjectWriteRelease(object_id); +} + Status CoreWorker::SealOwned(const ObjectID &object_id, bool pin_object, const std::unique_ptr &owner_address) { diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index ea01d202fba75..606868b380f29 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -698,6 +698,13 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { int64_t num_readers, std::shared_ptr *data); + /// Experimental method for mutable objects. Releases a write lock on the + /// object, allowing readers to read. This is the equivalent of "Seal" for + /// normal objects. + /// + /// \param[in] object_id The ID of the object. + Status ExperimentalMutableObjectWriteRelease(const ObjectID &object_id); + /// Experimental method for mutable objects. Releases the objects, allowing them /// to be written again. If the caller did not previously Get the objects, /// then this first blocks until the latest value is available to read, then diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.cc b/src/ray/core_worker/store_provider/plasma_store_provider.cc index 9d22c256355fc..d4c97b9e0ef01 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.cc +++ b/src/ray/core_worker/store_provider/plasma_store_provider.cc @@ -123,6 +123,11 @@ Status CoreWorkerPlasmaStoreProvider::ExperimentalMutableObjectWriteAcquire( data); } +Status CoreWorkerPlasmaStoreProvider::ExperimentalMutableObjectWriteRelease( + const ObjectID &object_id) { + return store_client_.ExperimentalMutableObjectWriteRelease(object_id); +} + Status CoreWorkerPlasmaStoreProvider::Create(const std::shared_ptr &metadata, const size_t data_size, const ObjectID &object_id, diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.h b/src/ray/core_worker/store_provider/plasma_store_provider.h index fff93c48c2e4a..3656655623828 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.h +++ b/src/ray/core_worker/store_provider/plasma_store_provider.h @@ -200,6 +200,13 @@ class CoreWorkerPlasmaStoreProvider { int64_t num_readers, std::shared_ptr *data); + /// Experimental method for mutable objects. Releases a write lock on the + /// object, allowing readers to read. This is the equivalent of "Seal" for + /// normal objects. + /// + /// \param[in] object_id The ID of the object. + Status ExperimentalMutableObjectWriteRelease(const ObjectID &object_id); + /// Experimental method for mutable objects. Releases the objects, allowing them /// to be written again. If the caller did not previously Get the objects, /// then this first blocks until the latest value is available to read, then diff --git a/src/ray/object_manager/plasma/client.cc b/src/ray/object_manager/plasma/client.cc index d06f073c91a01..36ea5ba8bd3d1 100644 --- a/src/ray/object_manager/plasma/client.cc +++ b/src/ray/object_manager/plasma/client.cc @@ -98,11 +98,6 @@ struct ObjectInUseEntry { /// The below fields are experimental and used to implement /// ray.experimental.channel. - /// - /// Whether the object is mutable. Most objects are immutable and cannot be - /// written to after the initial Create and Seal call. Mutable objects are - /// used to implement ray.experimental.channel. - bool is_mutable = false; /// Whether we are the writer. For now, only the original creator of the /// mutable object may write to it. 
bool is_writer = false; @@ -136,7 +131,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data); @@ -167,6 +162,8 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data); + Status ExperimentalMutableObjectWriteRelease(const ObjectID &object_id); + Status Get(const std::vector &object_ids, int64_t timeout_ms, std::vector *object_buffers, @@ -205,7 +202,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this *data); @@ -235,13 +232,11 @@ class PlasmaClient::Impl : public std::enable_shared_from_this(header_ptr); } -#endif void InsertObjectInUse(const ObjectID &object_id, std::unique_ptr object, @@ -343,7 +338,7 @@ void PlasmaClient::Impl::IncrementObjectCount(const ObjectID &object_id) { } Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, - bool is_mutable, + bool is_experimental_mutable_object, const uint8_t *metadata, uint64_t *retry_with_request_id, std::shared_ptr *data) { @@ -409,25 +404,7 @@ Status PlasmaClient::Impl::HandleCreateReply(const ObjectID &object_id, RAY_CHECK(object_entry != objects_in_use_.end()); auto &entry = object_entry->second; RAY_CHECK(!entry->is_sealed); - entry->is_mutable = is_mutable; - -#ifndef _WIN32 - auto plasma_header = GetPlasmaObjectHeader(entry->object); - if (entry->is_mutable) { - entry->is_writer = true; - } else { - // The first creation's version is always 1. - RAY_CHECK(entry->next_version_to_write == 1); - // The corresponding WriteRelease takes place in Seal. - // When an object is first created, the data size is equivalent to - // buffer size. - plasma_header->WriteAcquire(entry->next_version_to_write, - entry->object.data_size, - entry->object.metadata_size, - // Anyone may read an immutable object. - /*num_readers=*/-1); - } -#endif + entry->is_writer = true; return Status::OK(); } @@ -453,7 +430,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( RAY_CHECK(object_entry != objects_in_use_.end()); auto &entry = object_entry->second; - RAY_CHECK(entry->is_mutable); + RAY_CHECK(entry->object.is_experimental_mutable_object); RAY_CHECK(entry->is_sealed) << "Must Seal before writing again to a mutable object"; RAY_LOG(DEBUG) << "Write mutable object " << object_id; @@ -488,9 +465,39 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectWriteAcquire( return Status::OK(); } +Status PlasmaClient::Impl::ExperimentalMutableObjectWriteRelease( + const ObjectID &object_id) { +#ifndef _WIN32 + std::unique_lock guard(client_mutex_); + auto object_entry = objects_in_use_.find(object_id); + if (object_entry == objects_in_use_.end()) { + return Status::Invalid( + "Plasma buffer for mutable object not in scope. Are you sure you're the writer?"); + } + if (!object_entry->second->is_writer) { + return Status::Invalid( + "Mutable objects can only be written by the original creator process."); + } + RAY_CHECK(object_entry != objects_in_use_.end()); + + auto &entry = object_entry->second; + RAY_CHECK(entry->object.is_experimental_mutable_object); + RAY_CHECK(!entry->is_sealed) + << "Must WriteAcquire before WriteRelease on a mutable object"; + + entry->is_sealed = true; + auto plasma_header = GetPlasmaObjectHeader(entry->object); + plasma_header->WriteRelease( + /*write_version=*/entry->next_version_to_write); + // The next Write must pass a higher version. 
+ entry->next_version_to_write++; +#endif + return Status::OK(); +} + Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, - bool is_mutable, + bool is_experimental_mutable_object, int64_t data_size, const uint8_t *metadata, int64_t metadata_size, @@ -505,14 +512,14 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, RAY_RETURN_NOT_OK(SendCreateRequest(store_conn_, object_id, owner_address, - is_mutable, + is_experimental_mutable_object, data_size, metadata_size, source, device_num, /*try_immediately=*/false)); - Status status = - HandleCreateReply(object_id, is_mutable, metadata, &retry_with_request_id, data); + Status status = HandleCreateReply( + object_id, is_experimental_mutable_object, metadata, &retry_with_request_id, data); while (retry_with_request_id > 0) { guard.unlock(); @@ -524,7 +531,7 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, << retry_with_request_id; status = RetryCreate(object_id, retry_with_request_id, - is_mutable, + is_experimental_mutable_object, metadata, &retry_with_request_id, data); @@ -535,13 +542,14 @@ Status PlasmaClient::Impl::CreateAndSpillIfNeeded(const ObjectID &object_id, Status PlasmaClient::Impl::RetryCreate(const ObjectID &object_id, uint64_t request_id, - bool is_mutable, + bool is_experimental_mutable_object, const uint8_t *metadata, uint64_t *retry_with_request_id, std::shared_ptr *data) { std::lock_guard guard(client_mutex_); RAY_RETURN_NOT_OK(SendCreateRetryRequest(store_conn_, object_id, request_id)); - return HandleCreateReply(object_id, is_mutable, metadata, retry_with_request_id, data); + return HandleCreateReply( + object_id, is_experimental_mutable_object, metadata, retry_with_request_id, data); } Status PlasmaClient::Impl::TryCreateImmediately(const ObjectID &object_id, @@ -559,13 +567,14 @@ Status PlasmaClient::Impl::TryCreateImmediately(const ObjectID &object_id, RAY_RETURN_NOT_OK(SendCreateRequest(store_conn_, object_id, owner_address, - /*is_mutable=*/false, + /*is_experimental_mutable_object=*/false, data_size, metadata_size, source, device_num, /*try_immediately=*/true)); - return HandleCreateReply(object_id, /*is_mutable=*/false, metadata, nullptr, data); + return HandleCreateReply( + object_id, /*is_experimental_mutable_object=*/false, metadata, nullptr, data); } Status PlasmaClient::Impl::GetBuffers( @@ -594,8 +603,10 @@ Status PlasmaClient::Impl::GetBuffers( << "Attempting to get an object that this client created but hasn't sealed."; all_present = false; } else { - // Wait for the object to become ready to read. - RAY_RETURN_NOT_OK(EnsureGetAcquired(object_entry->second)); + if (object_entry->second->object.is_experimental_mutable_object) { + // Wait for the object to become ready to read. + RAY_RETURN_NOT_OK(EnsureGetAcquired(object_entry->second)); + } PlasmaObject *object = &object_entry->second->object; @@ -679,7 +690,7 @@ Status PlasmaClient::Impl::GetBuffers( auto &object_entry = objects_in_use_[received_object_ids[i]]; // Wait for the object to become ready to read. 
- if (!object_entry->read_acquired) { + if (object_entry->object.is_experimental_mutable_object) { RAY_RETURN_NOT_OK(EnsureGetAcquired(object_entry)); } std::shared_ptr physical_buf; @@ -733,6 +744,7 @@ Status PlasmaClient::Impl::EnsureGetAcquired( std::unique_ptr &object_entry) { #ifndef _WIN32 PlasmaObject *object = &object_entry->object; + RAY_CHECK(object->is_experimental_mutable_object); auto plasma_header = GetPlasmaObjectHeader(*object); if (object_entry->read_acquired) { return Status::OK(); @@ -747,19 +759,17 @@ Status PlasmaClient::Impl::EnsureGetAcquired( } object_entry->read_acquired = true; - if (version_read > 0) { - object_entry->is_mutable = true; - object_entry->next_version_to_read = version_read; - - // The data and metadata size may have changed, so update here before we - // create the Get buffer to return. - object_entry->object.data_size = plasma_header->data_size; - object_entry->object.metadata_size = plasma_header->metadata_size; - object_entry->object.metadata_offset = - object_entry->object.data_offset + object_entry->object.data_size; - RAY_CHECK(object_entry->object.data_size + object_entry->object.metadata_size <= - object_entry->object.allocated_size); - } + RAY_CHECK(version_read > 0); + object_entry->next_version_to_read = version_read; + + // The data and metadata size may have changed, so update here before we + // create the Get buffer to return. + object_entry->object.data_size = plasma_header->data_size; + object_entry->object.metadata_size = plasma_header->metadata_size; + object_entry->object.metadata_offset = + object_entry->object.data_offset + object_entry->object.data_size; + RAY_CHECK(object_entry->object.data_size + object_entry->object.metadata_size <= + object_entry->object.allocated_size); #endif return Status::OK(); } @@ -780,7 +790,7 @@ Status PlasmaClient::Impl::ExperimentalMutableObjectReadRelease( if (!entry->is_sealed) { return Status::ObjectNotFound("ray.release() called on an object that is not sealed"); } - if (!entry->is_mutable) { + if (!entry->object.is_experimental_mutable_object) { return Status::ObjectNotFound( "ray.release() called on an object that is not mutable"); } @@ -816,7 +826,7 @@ Status PlasmaClient::Impl::Release(const ObjectID &object_id) { const auto object_entry = objects_in_use_.find(object_id); RAY_CHECK(object_entry != objects_in_use_.end()); - if (!object_entry->second->is_mutable) { + if (!object_entry->second->object.is_experimental_mutable_object) { // Release only applies to immutable objects. // TODO(swang): Add a delete call to properly clean up mutable objects. object_entry->second->count -= 1; @@ -894,32 +904,20 @@ Status PlasmaClient::Impl::Seal(const ObjectID &object_id) { } object_entry->second->is_sealed = true; -#ifndef _WIN32 - auto plasma_header = GetPlasmaObjectHeader(object_entry->second->object); - plasma_header->WriteRelease( - /*write_version=*/object_entry->second->next_version_to_write); - // The next Write must pass a higher version. - object_entry->second->next_version_to_write++; - - if (plasma_header->num_readers <= 0) { -#else - { -#endif - // Send the seal request to Plasma. This is the normal Seal path, used for - // immutable objects and the initial Create call for mutable objects. 
- RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); - std::vector buffer; - RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); - ObjectID sealed_id; - RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); - RAY_CHECK(sealed_id == object_id); - // We call PlasmaClient::Release to decrement the number of instances of this - // object - // that are currently being used by this client. The corresponding increment - // happened in plasma_create and was used to ensure that the object was not - // released before the call to PlasmaClient::Seal. - RAY_RETURN_NOT_OK(Release(object_id)); - } + // Send the seal request to Plasma. This is the normal Seal path, used for + // immutable objects and the initial Create call for mutable objects. + RAY_RETURN_NOT_OK(SendSealRequest(store_conn_, object_id)); + std::vector buffer; + RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaSealReply, &buffer)); + ObjectID sealed_id; + RAY_RETURN_NOT_OK(ReadSealReply(buffer.data(), buffer.size(), &sealed_id)); + RAY_CHECK(sealed_id == object_id); + // We call PlasmaClient::Release to decrement the number of instances of this + // object + // that are currently being used by this client. The corresponding increment + // happened in plasma_create and was used to ensure that the object was not + // released before the call to PlasmaClient::Seal. + RAY_RETURN_NOT_OK(Release(object_id)); return Status::OK(); } @@ -1061,9 +1059,13 @@ Status PlasmaClient::ExperimentalMutableObjectWriteAcquire( object_id, data_size, metadata, metadata_size, num_readers, data); } +Status PlasmaClient::ExperimentalMutableObjectWriteRelease(const ObjectID &object_id) { + return impl_->ExperimentalMutableObjectWriteRelease(object_id); +} + Status PlasmaClient::CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, - bool is_mutable, + bool is_experimental_mutable_object, int64_t data_size, const uint8_t *metadata, int64_t metadata_size, @@ -1072,7 +1074,7 @@ Status PlasmaClient::CreateAndSpillIfNeeded(const ObjectID &object_id, int device_num) { return impl_->CreateAndSpillIfNeeded(object_id, owner_address, - is_mutable, + is_experimental_mutable_object, data_size, metadata, metadata_size, diff --git a/src/ray/object_manager/plasma/client.h b/src/ray/object_manager/plasma/client.h index e3f1aa1b05e3f..bf9b9099f39b5 100644 --- a/src/ray/object_manager/plasma/client.h +++ b/src/ray/object_manager/plasma/client.h @@ -103,6 +103,13 @@ class PlasmaClientInterface { int64_t num_readers, std::shared_ptr *data) = 0; + /// Experimental method for mutable objects. Releases a write lock on the + /// object, allowing readers to read. This is the equivalent of "Seal" for + /// normal objects. + /// + /// \param[in] object_id The ID of the object. + virtual Status ExperimentalMutableObjectWriteRelease(const ObjectID &object_id) = 0; + /// Experimental method for mutable objects. Releases the objects, allowing them /// to be written again. If the caller did not previously Get the objects, /// then this first blocks until the latest value is available to read, then @@ -238,6 +245,8 @@ class PlasmaClient : public PlasmaClientInterface { int64_t num_readers, std::shared_ptr *data); + Status ExperimentalMutableObjectWriteRelease(const ObjectID &object_id); + /// Create an object in the Plasma Store. Any metadata for this object must be /// be passed in when the object is created. 
/// diff --git a/src/ray/object_manager/plasma/common.h b/src/ray/object_manager/plasma/common.h index 414af54b4a544..6ffb86fdeb761 100644 --- a/src/ray/object_manager/plasma/common.h +++ b/src/ray/object_manager/plasma/common.h @@ -150,6 +150,7 @@ class LocalObject { object->allocated_size = object->data_size + object->metadata_size; object->device_num = GetAllocation().device_num; object->mmap_size = GetAllocation().mmap_size; + object->is_experimental_mutable_object = object_info.is_mutable; } private: diff --git a/src/ray/object_manager/plasma/plasma.fbs b/src/ray/object_manager/plasma/plasma.fbs index 0c4f8ac66f48b..317e0aad4846a 100644 --- a/src/ray/object_manager/plasma/plasma.fbs +++ b/src/ray/object_manager/plasma/plasma.fbs @@ -112,6 +112,9 @@ struct PlasmaObjectSpec { allocated_size: ulong; // Device to create buffer on. device_num: int; + // Whether this is an experimental mutable object that can be written + // multiple times by a client. + is_experimental_mutable_object: bool; } table PlasmaGetDebugStringRequest { diff --git a/src/ray/object_manager/plasma/plasma.h b/src/ray/object_manager/plasma/plasma.h index bb21f394d5b0c..7b1367181ad34 100644 --- a/src/ray/object_manager/plasma/plasma.h +++ b/src/ray/object_manager/plasma/plasma.h @@ -55,6 +55,7 @@ struct PlasmaObject { int device_num; /// Set if device_num is equal to 0. int64_t mmap_size; + bool is_experimental_mutable_object = false; bool operator==(const PlasmaObject &other) const { return ((store_fd == other.store_fd) && (data_offset == other.data_offset) && diff --git a/src/ray/object_manager/plasma/protocol.cc b/src/ray/object_manager/plasma/protocol.cc index c041486bdefec..fd880daa7c064 100644 --- a/src/ray/object_manager/plasma/protocol.cc +++ b/src/ray/object_manager/plasma/protocol.cc @@ -200,7 +200,7 @@ Status SendCreateRetryRequest(const std::shared_ptr &store_conn, Status SendCreateRequest(const std::shared_ptr &store_conn, ObjectID object_id, const ray::rpc::Address &owner_address, - bool is_mutable, + bool is_experimental_mutable_object, int64_t data_size, int64_t metadata_size, flatbuf::ObjectSource source, @@ -214,7 +214,7 @@ Status SendCreateRequest(const std::shared_ptr &store_conn, fbb.CreateString(owner_address.ip_address()), owner_address.port(), fbb.CreateString(owner_address.worker_id()), - is_mutable, + is_experimental_mutable_object, data_size, metadata_size, source, @@ -269,7 +269,8 @@ Status SendCreateReply(const std::shared_ptr &client, object.metadata_offset, object.metadata_size, object.allocated_size, - object.device_num); + object.device_num, + object.is_experimental_mutable_object); auto object_string = fbb.CreateString(object_id.Binary()); fb::PlasmaCreateReplyBuilder crb(fbb); crb.add_error(static_cast(error_code)); @@ -627,7 +628,8 @@ Status SendGetReply(const std::shared_ptr &client, object.metadata_offset, object.metadata_size, object.allocated_size, - object.device_num)); + object.device_num, + object.is_experimental_mutable_object)); } std::vector store_fds_as_int; std::vector unique_fd_ids; From 2e677c34c6c3c181444f1c857083d894d064cbb6 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 6 Dec 2023 21:49:28 -0800 Subject: [PATCH 39/66] x Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/protocol.cc | 4 ++++ src/ray/object_manager/test/object_buffer_pool_test.cc | 1 + 2 files changed, 5 insertions(+) diff --git a/src/ray/object_manager/plasma/protocol.cc b/src/ray/object_manager/plasma/protocol.cc index fd880daa7c064..771cb7087cd35 100644 --- 
a/src/ray/object_manager/plasma/protocol.cc +++ b/src/ray/object_manager/plasma/protocol.cc @@ -312,6 +312,8 @@ Status ReadCreateReply(uint8_t *data, object->metadata_offset = message->plasma_object()->metadata_offset(); object->metadata_size = message->plasma_object()->metadata_size(); object->allocated_size = message->plasma_object()->allocated_size(); + object->is_experimental_mutable_object = + message->plasma_object()->is_experimental_mutable_object(); store_fd->first = INT2FD(message->store_fd()); store_fd->second = message->unique_fd_id(); @@ -672,6 +674,8 @@ Status ReadGetReply(uint8_t *data, plasma_objects[i].metadata_size = object->metadata_size(); plasma_objects[i].allocated_size = object->allocated_size(); plasma_objects[i].device_num = object->device_num(); + plasma_objects[i].is_experimental_mutable_object = + object->is_experimental_mutable_object(); } RAY_CHECK(message->store_fds()->size() == message->mmap_sizes()->size()); for (uoffset_t i = 0; i < message->store_fds()->size(); i++) { diff --git a/src/ray/object_manager/test/object_buffer_pool_test.cc b/src/ray/object_manager/test/object_buffer_pool_test.cc index 1ae4602f06acd..c3568d65324e5 100644 --- a/src/ray/object_manager/test/object_buffer_pool_test.cc +++ b/src/ray/object_manager/test/object_buffer_pool_test.cc @@ -53,6 +53,7 @@ class MockPlasmaClient : public plasma::PlasmaClientInterface { ray::Status CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, + bool is_experimental_mutable_object, int64_t data_size, const uint8_t *metadata, int64_t metadata_size, From f06b543f8f5cf7d1c698fbb9361fde3f68f3b8d2 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 09:25:14 -0800 Subject: [PATCH 40/66] cpp test Signed-off-by: Stephanie Wang --- src/ray/object_manager/plasma/client.h | 4 ++-- .../object_manager/test/object_buffer_pool_test.cc | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/ray/object_manager/plasma/client.h b/src/ray/object_manager/plasma/client.h index bf9b9099f39b5..d50d1b8c5de9e 100644 --- a/src/ray/object_manager/plasma/client.h +++ b/src/ray/object_manager/plasma/client.h @@ -247,6 +247,8 @@ class PlasmaClient : public PlasmaClientInterface { Status ExperimentalMutableObjectWriteRelease(const ObjectID &object_id); + Status ExperimentalMutableObjectReadRelease(const ObjectID &object_id); + /// Create an object in the Plasma Store. Any metadata for this object must be /// be passed in when the object is created. /// @@ -302,8 +304,6 @@ class PlasmaClient : public PlasmaClientInterface { std::vector *object_buffers, bool is_from_worker); - Status ExperimentalMutableObjectReadRelease(const ObjectID &object_id); - /// Tell Plasma that the client no longer needs the object. This should be /// called after Get() or Create() when the client is done with the object. /// After this call, the buffer returned by Get() is no longer valid. 
diff --git a/src/ray/object_manager/test/object_buffer_pool_test.cc b/src/ray/object_manager/test/object_buffer_pool_test.cc index c3568d65324e5..249f8bda3b3a8 100644 --- a/src/ray/object_manager/test/object_buffer_pool_test.cc +++ b/src/ray/object_manager/test/object_buffer_pool_test.cc @@ -51,6 +51,20 @@ class MockPlasmaClient : public plasma::PlasmaClientInterface { MOCK_METHOD1(Abort, ray::Status(const ObjectID &object_id)); + MOCK_METHOD6(ExperimentalMutableObjectWriteAcquire, + ray::Status(const ObjectID &object_id, + int64_t data_size, + const uint8_t *metadata, + int64_t metadata_size, + int64_t num_readers, + std::shared_ptr *data)); + + MOCK_METHOD1(ExperimentalMutableObjectWriteRelease, + ray::Status(const ObjectID &object_id)); + + MOCK_METHOD1(ExperimentalMutableObjectReadRelease, + ray::Status(const ObjectID &object_id)); + ray::Status CreateAndSpillIfNeeded(const ObjectID &object_id, const ray::rpc::Address &owner_address, bool is_experimental_mutable_object, From 494cb537d0a86e58b8b83bf5f89eaa1b2c41c642 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 09:48:32 -0800 Subject: [PATCH 41/66] Revert "tmp" This reverts commit 93968107d6de00336ca5af23a3c95e9f655ddc89. --- python/ray/_raylet.pyx | 7 ------- python/ray/experimental/channel.py | 4 ---- src/ray/core_worker/core_worker.cc | 4 ---- 3 files changed, 15 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 4d823e36bf0fe..595986851ed7c 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -3507,13 +3507,6 @@ cdef class CoreWorker: c_object_id, )) - def experimental_mutable_object_get_current_output_ref(self, ObjectRef channel_ref): - cdef: - CObjectID c_channel_ref = channel_ref - with nogil: - return (CCoreWorkerProcess.GetCoreWorker() - .ExperimentalMutableObjectGetCurrentOutputRef(c_channel_ref)) - def experimental_mutable_object_read_release(self, object_refs): """ For experimental.channel.Channel. diff --git a/python/ray/experimental/channel.py b/python/ray/experimental/channel.py index 11f5e36053822..e8ef9ad085f79 100644 --- a/python/ray/experimental/channel.py +++ b/python/ray/experimental/channel.py @@ -120,10 +120,6 @@ def write(self, value: Any, num_readers: Optional[int] = None): num_readers, ) - def get_current_output_ref(self) -> "ray.ObjectRef": - return self._worker.core_worker.experimental_mutable_object_get_current_output_ref( - self._base_ref) - def begin_read(self) -> Any: """ Read the latest value from the channel. 
This call will block until a diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 1ac8796e94057..ce1ebe57de8e5 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1307,10 +1307,6 @@ Status CoreWorker::CreateOwnedAndIncrementLocalRef( memory_store_->Put(RayObject(rpc::ErrorType::OBJECT_IN_PLASMA), *object_id)); } } - - if (is_experimental_mutable_object) { - RegisterChannel(*object_id); - } return Status::OK(); } From 05b002fe910b152e744c07f6350a856936653f9c Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 09:49:38 -0800 Subject: [PATCH 42/66] cleanup Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 49 ++++++++++++++------ python/ray/dag/dag_node.py | 31 ++++--------- python/ray/dag/tests/test_accelerated_dag.py | 9 ++-- 3 files changed, 49 insertions(+), 40 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 74ce83c91a917..a85c8b695a006 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -81,17 +81,17 @@ def __init__(self): self.node_idx_to_output_channels = {} # Cached. - self.dag_input_ref = None + self.dag_input_channel = None self.dag_output_channels = None self.worker_task_refs = [] - def add_node(self, node): + def _add_node(self, node): idx = self.counter self.idx_to_task[idx] = CompiledTask(idx, node) self.dag_node_to_idx[node] = idx self.counter += 1 - def preprocess(self): + def _preprocess(self): from ray.dag import DAGNode, InputNode for idx, task in self.idx_to_task.items(): @@ -116,13 +116,13 @@ def preprocess(self): ) self.output_task_idx = idx - def compiled(self): + def _compiled(self): from ray.dag import DAGNode, InputNode, OutputNode, ClassMethodNode - if self.dag_input_ref is not None and self.dag_output_channels is not None: + if self.dag_input_channel is not None and self.dag_output_channels is not None: # Driver should ray.put on input, ray.get/release on output return ( - self.dag_input_ref, + self.dag_input_channel, self.dag_output_channels, ) @@ -177,7 +177,7 @@ def compiled(self): assert arg_buffer is not None resolved_args.append(arg_buffer) - # TODO: Assign the task with the correct input and output buffers. + # Assign the task with the correct input and output buffers. worker_fn = task.dag_node._get_remote_method("__ray_call__") self.worker_task_refs.append( worker_fn.remote( @@ -187,7 +187,7 @@ def compiled(self): ) ) - self.dag_input_ref = self.idx_to_task[self.input_task_idx].output_channel + self.dag_input_channel = self.idx_to_task[self.input_task_idx].output_channel self.dag_output_channels = [] for output in self.idx_to_task[self.output_task_idx].args: @@ -195,19 +195,42 @@ def compiled(self): output_idx = self.dag_node_to_idx[output] self.dag_output_channels.append(self.idx_to_task[output_idx].output_channel) - assert self.dag_input_ref + assert self.dag_input_channel assert self.dag_output_channels # Driver should ray.put on input, ray.get/release on output - return (self.dag_input_ref, self.dag_output_channels) + return (self.dag_input_channel, self.dag_output_channels) + def execute( + self, + *args, + **kwargs, + ) -> List["ray.experimental.channel.Channel"]: + """Execute this DAG using the compiled execution path. -def build_compiled_dag(dag: "ray.dag.DAGNode"): + Args: + args: Args to the InputNode. + kwargs: Kwargs to the InputNode. Not supported yet. + + Returns: + A list of Channels that can be used to read the DAG result. 
+ """ + if len(args) != 1: + raise NotImplementedError("Compiled DAGs support exactly one InputNode arg") + if len(kwargs) != 0: + raise NotImplementedError("Compiled DAGs do not support kwargs") + + input_channel, output_channels = self._compiled() + input_channel.write(args[0]) + return output_channels + + +def build_compiled_dag_from_ray_dag(dag: "ray.dag.DAGNode"): compiled_dag = CompiledDAG() def _build_compiled_dag(node): - compiled_dag.add_node(node) + compiled_dag._add_node(node) return node dag.apply_recursive(_build_compiled_dag) - compiled_dag.preprocess() + compiled_dag._preprocess() return compiled_dag diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 8d8e219f26c50..64a3fd1f1db31 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -16,7 +16,7 @@ import uuid import asyncio -from ray.dag.compiled_dag_node import build_compiled_dag +from ray.dag.compiled_dag_node import build_compiled_dag_from_ray_dag T = TypeVar("T") @@ -107,18 +107,19 @@ async def get_object_refs_from_last_execute(self) -> Dict[str, Any]: def clear_cache(self): self.cache_from_last_execute = {} - def compiled(self) -> Tuple[ray.ObjectRef]: + def experimental_compile(self) -> "ray.dag.CompiledDAG": + """Compile an accelerated execution path for this DAG. The compiled DAG + is cached. + """ if self._compiled_dag is None: - self._compiled_dag = build_compiled_dag(self) + self._compiled_dag = build_compiled_dag_from_ray_dag(self) - return self._compiled_dag.compiled() + return self._compiled_dag def execute( self, *args, _ray_cache_refs: bool = False, - _ray_cache_actors: bool = True, - compiled: bool = False, **kwargs, ) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: """Execute this DAG using the Ray default executor _execute_impl(). @@ -130,31 +131,15 @@ def execute( - Serve handles for class nodes - resolved values representing user input at runtime """ - if compiled: - assert len(args) == 1, "Compiled DAGs support exactly one InputNode arg" - input_ref, output_channels = self.compiled() - input_ref.write(args[0]) - return output_channels def executor(node): return node._execute_impl(*args, **kwargs) - cache = {} + result = self.apply_recursive(executor) if _ray_cache_refs: - cache = self.cache_from_last_execute - elif _ray_cache_actors: - for key, ref in self.cache_from_last_execute.items(): - if isinstance(ref, ray.actor.ActorHandle): - cache[key] = ref - result = self.apply_recursive(executor, cache=cache) - if _ray_cache_refs or _ray_cache_actors: self.cache_from_last_execute = executor.cache return result - def destroy_compiled_dag(self): - _, _, _, monitor = self.compiled() - monitor.destroy() - def _get_toplevel_child_nodes(self) -> List["DAGNode"]: """Return the list of nodes specified as top-level args. 
diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index b07491c1f18c6..9cab0bd70a653 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -3,13 +3,12 @@ import os import sys -import numpy as np import pytest import ray import ray.cluster_utils -import ray.experimental.channel as ray_channel -from ray.dag import DAGNode, InputNode, OutputNode +from ray.dag import InputNode, OutputNode +from ray.tests.conftest import * # noqa logger = logging.getLogger(__name__) @@ -34,8 +33,10 @@ def test_scatter_gather_dag(ray_start_regular, num_actors): out = [a.inc.bind(i) for a in actors] dag = OutputNode(out) + compiled_dag = dag.experimental_compile() + for i in range(3): - output_channels = dag.execute(1, compiled=True) + output_channels = compiled_dag.execute(1) # TODO(swang): Replace with fake ObjectRef. results = [chan.begin_read() for chan in output_channels] assert results == [init_val + i + 1] * num_actors From 521c73b588f01a80e47a55ad3bb16d148453c60c Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 10:07:02 -0800 Subject: [PATCH 43/66] Support no-OutputNode DAGs Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 54 +++++++++++++++----- python/ray/dag/tests/test_accelerated_dag.py | 20 ++++++-- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index a85c8b695a006..fdefb0f0257af 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Tuple, Union import ray import ray.experimental.channel as ray_channel @@ -6,6 +6,8 @@ MAX_BUFFER_SIZE = int(100 * 1e6) # 100MB +ChannelType = "ray.experimental.channel.Channel" + def allocate_channel(buffer_size_bytes: int = MAX_BUFFER_SIZE, num_readers: int = 1): if not isinstance(buffer_size_bytes, int): @@ -52,7 +54,7 @@ def __init__(self, idx, dag_node: "ray.dag.DAGNode"): self.dag_node = dag_node self.args = [] - self.dependent_node_idxs = [] + self.dependent_node_idxs = set() self.output_channel = None @property @@ -78,6 +80,7 @@ def __init__(self): self.input_task_idx = None self.output_task_idx = None + self.has_single_output = False self.node_idx_to_output_channels = {} # Cached. @@ -92,14 +95,22 @@ def _add_node(self, node): self.counter += 1 def _preprocess(self): - from ray.dag import DAGNode, InputNode + """Before compiling, preprocess the DAG to build an index from task to + upstream and downstream tasks, and to set the input and output node(s) + of the DAG. + """ + from ray.dag import DAGNode, InputNode, OutputNode + # For each task node, set its upstream and downstream task nodes. for idx, task in self.idx_to_task.items(): task.args = task.dag_node.get_args() for arg in task.args: if isinstance(arg, DAGNode): arg_idx = self.dag_node_to_idx[arg] - self.idx_to_task[arg_idx].dependent_node_idxs.append(idx) + self.idx_to_task[arg_idx].dependent_node_idxs.add(idx) + + # Find the input node to the DAG. + for idx, task in self.idx_to_task.items(): if isinstance(task.dag_node, InputNode): assert self.input_task_idx is None, "more than one InputNode found" self.input_task_idx = idx @@ -108,6 +119,7 @@ def _preprocess(self): self.input_task_idx is not None ), "no InputNode found, require exactly one" + # Find the (multi-)output node to the DAG. 
for idx, task in self.idx_to_task.items(): if len(task.dependent_node_idxs) == 0: assert self.output_task_idx is None, ( @@ -116,9 +128,26 @@ def _preprocess(self): ) self.output_task_idx = idx - def _compiled(self): + assert self.output_task_idx is not None + output_node = self.idx_to_task[self.output_task_idx].dag_node + # Add an OutputNode to the end of the DAG if it's not already there. + if not isinstance(output_node, OutputNode): + self.has_single_output = True + output_node = OutputNode([output_node]) + self._add_node(output_node) + self.output_task_idx = self.dag_node_to_idx[output_node] + # Preprocess one more time so that we have the right output node + # now. + self.input_task_idx, self.output_task_idx = None, None + self._preprocess() + + def _compiled(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: + """ """ from ray.dag import DAGNode, InputNode, OutputNode, ClassMethodNode + if self.input_task_idx is None: + self._preprocess() + if self.dag_input_channel is not None and self.dag_output_channels is not None: # Driver should ray.put on input, ray.get/release on output return ( @@ -154,11 +183,6 @@ def _compiled(self): for idx in task.dependent_node_idxs: queue.append(idx) - output_node = self.idx_to_task[self.output_task_idx].dag_node - # TODO: Add an OutputNode to the end of the DAG if - # it's not already there. - assert isinstance(output_node, OutputNode) - for node_idx, task in self.idx_to_task.items(): if node_idx == self.input_task_idx: # We don't need to assign an actual task for the input node. @@ -197,6 +221,13 @@ def _compiled(self): assert self.dag_input_channel assert self.dag_output_channels + # If no OutputNode was specified during the DAG creation, there is only + # one output. Return a single output channel instead of a list of + # channels. + if self.has_single_output: + assert len(self.dag_output_channels) == 1 + self.dag_output_channels = self.dag_output_channels[0] + # Driver should ray.put on input, ray.get/release on output return (self.dag_input_channel, self.dag_output_channels) @@ -204,7 +235,7 @@ def execute( self, *args, **kwargs, - ) -> List["ray.experimental.channel.Channel"]: + ) -> Union[ChannelType, List[ChannelType]]: """Execute this DAG using the compiled execution path. Args: @@ -232,5 +263,4 @@ def _build_compiled_dag(node): return node dag.apply_recursive(_build_compiled_dag) - compiled_dag._preprocess() return compiled_dag diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 9cab0bd70a653..3414b5dd4183a 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -25,10 +25,24 @@ def inc(self, x): return self.i +def test_single_output_dag(ray_start_regular): + a = Actor.remote(0) + with InputNode() as i: + dag = a.inc.bind(i) + + compiled_dag = dag.experimental_compile() + + for i in range(3): + output_channel = compiled_dag.execute(1) + # TODO(swang): Replace with fake ObjectRef. 
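+        # begin_read() blocks until the next value is written to the channel, and
+        # end_read() must be called afterwards to release the buffer so that the
+        # writer can write the next value.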
+ result = output_channel.begin_read() + assert result == i + 1 + output_channel.end_read() + + @pytest.mark.parametrize("num_actors", [1, 4]) def test_scatter_gather_dag(ray_start_regular, num_actors): - init_val = 0 - actors = [Actor.remote(init_val) for _ in range(num_actors)] + actors = [Actor.remote(0) for _ in range(num_actors)] with InputNode() as i: out = [a.inc.bind(i) for a in actors] dag = OutputNode(out) @@ -39,7 +53,7 @@ def test_scatter_gather_dag(ray_start_regular, num_actors): output_channels = compiled_dag.execute(1) # TODO(swang): Replace with fake ObjectRef. results = [chan.begin_read() for chan in output_channels] - assert results == [init_val + i + 1] * num_actors + assert results == [i + 1] * num_actors for chan in output_channels: chan.end_read() From 5b58250704f73adc5dd8b0f80a336d29c1decd91 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 10:16:41 -0800 Subject: [PATCH 44/66] Support non-DAG args Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 42 ++++++++++++-------- python/ray/dag/tests/test_accelerated_dag.py | 21 ++++++++++ 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index fdefb0f0257af..b94366b760bd8 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Union +from typing import Any, List, Tuple, Union import ray import ray.experimental.channel as ray_channel @@ -27,18 +27,30 @@ def do_allocate_channel( def do_exec_compiled_task( self, - input_channels: List["ray_channel.Channel"], + inputs: List[Union[Any, "ray_channel.Channel"]], actor_method_name: str, ): try: - self._input_channels = input_channels method = getattr(self, actor_method_name) + + resolved_inputs = [] + input_channel_idxs = [] + # Add placeholders for input channels. + for inp in inputs: + if isinstance(inp, ray_channel.Channel): + input_channel_idxs.append((len(resolved_inputs), inp)) + resolved_inputs.append(None) + else: + resolved_inputs.append(inp) + while True: - inputs = [chan.begin_read() for chan in input_channels] - output_val = method(*inputs) + for idx, chan in input_channel_idxs: + resolved_inputs[idx] = chan.begin_read() + + output_val = method(*resolved_inputs) self._output_channel.write(output_val) - for chan in input_channels: + for _, chan in input_channel_idxs: chan.end_read() except Exception as e: @@ -122,10 +134,7 @@ def _preprocess(self): # Find the (multi-)output node to the DAG. for idx, task in self.idx_to_task.items(): if len(task.dependent_node_idxs) == 0: - assert self.output_task_idx is None, ( - "More than one output node found, " - "make sure only one node has 0 dependent tasks" - ) + assert self.output_task_idx is None, "More than one output node found" self.output_task_idx = idx assert self.output_task_idx is not None @@ -194,12 +203,13 @@ def _compiled(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]] resolved_args = [] for arg in task.args: - # TODO(swang): Support non-ObjectRef args. 
- assert isinstance(arg, DAGNode) - arg_idx = self.dag_node_to_idx[arg] - arg_buffer = self.idx_to_task[arg_idx].output_channel - assert arg_buffer is not None - resolved_args.append(arg_buffer) + if isinstance(arg, DAGNode): + arg_idx = self.dag_node_to_idx[arg] + arg_channel = self.idx_to_task[arg_idx].output_channel + assert arg_channel is not None + resolved_args.append(arg_channel) + else: + resolved_args.append(arg) # Assign the task with the correct input and output buffers. worker_fn = task.dag_node._get_remote_method("__ray_call__") diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 3414b5dd4183a..dabcf57f62973 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -24,6 +24,11 @@ def inc(self, x): self.i += x return self.i + def inc_two(self, x, y): + self.i += x + self.i += y + return self.i + def test_single_output_dag(ray_start_regular): a = Actor.remote(0) @@ -40,6 +45,22 @@ def test_single_output_dag(ray_start_regular): output_channel.end_read() +def test_regular_args(ray_start_regular): + # Test passing regular args to .bind in addition to DAGNode args. + a = Actor.remote(0) + with InputNode() as i: + dag = a.inc_two.bind(2, i) + + compiled_dag = dag.experimental_compile() + + for i in range(3): + output_channel = compiled_dag.execute(1) + # TODO(swang): Replace with fake ObjectRef. + result = output_channel.begin_read() + assert result == (i + 1) * 3 + output_channel.end_read() + + @pytest.mark.parametrize("num_actors", [1, 4]) def test_scatter_gather_dag(ray_start_regular, num_actors): actors = [Actor.remote(0) for _ in range(num_actors)] From b5beca4e73df7b3503efb59ea7f1adb770a4bbde Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 10:46:54 -0800 Subject: [PATCH 45/66] errors Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 64 +++++++++++++++++--- python/ray/dag/tests/test_accelerated_dag.py | 46 ++++++++++++++ 2 files changed, 100 insertions(+), 10 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index b94366b760bd8..3414dceca17c6 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -90,14 +90,16 @@ def __init__(self): # idx counter. self.counter = 0 + # Attributes that are set during preprocessing. + # Preprocessing identifies the input node and output node. self.input_task_idx = None self.output_task_idx = None self.has_single_output = False - self.node_idx_to_output_channels = {} - # Cached. + # Cached attributes that are set during compilation. self.dag_input_channel = None self.dag_output_channels = None + self.node_idx_to_output_channels = {} self.worker_task_refs = [] def _add_node(self, node): @@ -111,10 +113,38 @@ def _preprocess(self): upstream and downstream tasks, and to set the input and output node(s) of the DAG. """ - from ray.dag import DAGNode, InputNode, OutputNode + from ray.dag import ( + DAGNode, + ClassMethodNode, + FunctionNode, + InputAttributeNode, + InputNode, + OutputNode, + ) # For each task node, set its upstream and downstream task nodes. for idx, task in self.idx_to_task.items(): + dag_node = task.dag_node + if not ( + isinstance(dag_node, InputNode) + or isinstance(dag_node, OutputNode) + or isinstance(dag_node, ClassMethodNode) + ): + if isinstance(dag_node, InputAttributeNode): + # TODO(swang): Support multi args. 
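+                    # InputAttributeNode is produced by indexing or attribute access
+                    # on the InputNode (e.g. inp[0] or inp.key), which implies
+                    # multiple args or kwargs at the DAG input.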
+ raise ValueError( + "Compiled DAGs currently do not support kwargs or multiple args for InputNode" + ) + elif isinstance(dag_node, FunctionNode): + # TODO(swang): Support non-actor tasks. + raise ValueError( + "Compiled DAGs currently only support actor method nodes" + ) + else: + raise ValueError( + f"Found unsupported node of type {type(task.dag_node)}" + ) + task.args = task.dag_node.get_args() for arg in task.args: if isinstance(arg, DAGNode): @@ -127,9 +157,8 @@ def _preprocess(self): assert self.input_task_idx is None, "more than one InputNode found" self.input_task_idx = idx # TODO: Support no-input DAGs (use an empty object to signal). - assert ( - self.input_task_idx is not None - ), "no InputNode found, require exactly one" + if self.input_task_idx is None: + raise ValueError("Compiled DAGs currently require exactly one InputNode") # Find the (multi-)output node to the DAG. for idx, task in self.idx_to_task.items(): @@ -150,7 +179,7 @@ def _preprocess(self): self.input_task_idx, self.output_task_idx = None, None self._preprocess() - def _compiled(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: + def _compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: """ """ from ray.dag import DAGNode, InputNode, OutputNode, ClassMethodNode @@ -202,14 +231,23 @@ def _compiled(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]] continue resolved_args = [] + has_at_least_one_channel_input = False for arg in task.args: if isinstance(arg, DAGNode): arg_idx = self.dag_node_to_idx[arg] arg_channel = self.idx_to_task[arg_idx].output_channel assert arg_channel is not None resolved_args.append(arg_channel) + has_at_least_one_channel_input = True else: resolved_args.append(arg) + # TODO: Support no-input DAGs (use an empty object to signal). + if not has_at_least_one_channel_input: + raise ValueError( + "Compiled DAGs require each task to take a " + "ray.dag.InputNode or at least one other DAGNode as an " + "input" + ) # Assign the task with the correct input and output buffers. worker_fn = task.dag_node._get_remote_method("__ray_call__") @@ -231,6 +269,9 @@ def _compiled(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]] assert self.dag_input_channel assert self.dag_output_channels + assert [ + output_channel is not None for output_channel in self.dag_output_channels + ] # If no OutputNode was specified during the DAG creation, there is only # one output. Return a single output channel instead of a list of # channels. @@ -255,12 +296,14 @@ def execute( Returns: A list of Channels that can be used to read the DAG result. """ + # These errors should already be caught during compilation, but just in + # case. 
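+        # For illustration (assumed call shapes), given a compiled DAG `compiled_dag`:
+        #   compiled_dag.execute(x)        # OK: exactly one positional InputNode arg
+        #   compiled_dag.execute(x, y)     # rejected: multiple args not supported yet
+        #   compiled_dag.execute(inp=x)    # rejected: kwargs not supported yet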
if len(args) != 1: - raise NotImplementedError("Compiled DAGs support exactly one InputNode arg") + raise ValueError("Compiled DAGs support exactly one InputNode arg") if len(kwargs) != 0: - raise NotImplementedError("Compiled DAGs do not support kwargs") + raise ValueError("Compiled DAGs do not support kwargs") - input_channel, output_channels = self._compiled() + input_channel, output_channels = self._compile() input_channel.write(args[0]) return output_channels @@ -273,4 +316,5 @@ def _build_compiled_dag(node): return node dag.apply_recursive(_build_compiled_dag) + compiled_dag._compile() return compiled_dag diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index dabcf57f62973..405cd2876e84c 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -79,6 +79,52 @@ def test_scatter_gather_dag(ray_start_regular, num_actors): chan.end_read() +def test_dag_errors(ray_start_regular): + a = Actor.remote(0) + dag = a.inc.bind(1) + with pytest.raises( + ValueError, match="Compiled DAGs currently require exactly one InputNode" + ): + dag.experimental_compile() + + a2 = Actor.remote(0) + with InputNode() as inp: + dag = OutputNode([a.inc.bind(inp), a2.inc.bind(1)]) + with pytest.raises( + ValueError, + match="Compiled DAGs require each task to take a ray.dag.InputNode or " + "at least one other DAGNode as an input", + ): + dag.experimental_compile() + + @ray.remote + def f(x): + return x + + with InputNode() as inp: + dag = f.bind(inp) + with pytest.raises( + ValueError, match="Compiled DAGs currently only support actor method nodes" + ): + dag.experimental_compile() + + with InputNode() as inp: + dag = a.inc_two.bind(inp[0], inp[1]) + with pytest.raises( + ValueError, + match="Compiled DAGs currently do not support kwargs or multiple args for InputNode", + ): + dag.experimental_compile() + + with InputNode() as inp: + dag = a.inc_two.bind(inp.x, inp.y) + with pytest.raises( + ValueError, + match="Compiled DAGs currently do not support kwargs or multiple args for InputNode", + ): + dag.experimental_compile() + + if __name__ == "__main__": if os.environ.get("PARALLEL_CI"): sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) From cc2e795c6cf037499e503f2f7ed36634ab619bbe Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 10:47:39 -0800 Subject: [PATCH 46/66] lint Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 3 ++- python/ray/dag/tests/test_accelerated_dag.py | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 3414dceca17c6..8dc6d0db9474d 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -133,7 +133,8 @@ def _preprocess(self): if isinstance(dag_node, InputAttributeNode): # TODO(swang): Support multi args. raise ValueError( - "Compiled DAGs currently do not support kwargs or multiple args for InputNode" + "Compiled DAGs currently do not support kwargs or " + "multiple args for InputNode" ) elif isinstance(dag_node, FunctionNode): # TODO(swang): Support non-actor tasks. 
diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 405cd2876e84c..4702ee45580f1 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -112,7 +112,8 @@ def f(x): dag = a.inc_two.bind(inp[0], inp[1]) with pytest.raises( ValueError, - match="Compiled DAGs currently do not support kwargs or multiple args for InputNode", + match="Compiled DAGs currently do not support kwargs or multiple args " + "for InputNode", ): dag.experimental_compile() @@ -120,7 +121,8 @@ def f(x): dag = a.inc_two.bind(inp.x, inp.y) with pytest.raises( ValueError, - match="Compiled DAGs currently do not support kwargs or multiple args for InputNode", + match="Compiled DAGs currently do not support kwargs or multiple args " + "for InputNode", ): dag.experimental_compile() From c17c3671e2278e993f5b5239c3bef3ba135acfde Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 10:50:15 -0800 Subject: [PATCH 47/66] doc Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 8dc6d0db9474d..35890b3367939 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -181,7 +181,17 @@ def _preprocess(self): self._preprocess() def _compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: - """ """ + """Compile an execution path. This allocates channels for adjacent + tasks to send/receive values. An infinite task is submitted to each + actor in the DAG that repeatedly receives from input channel(s) and + sends to output channel(s). + + Returns: + A tuple of (input channel, output channel(s)). The input channel + that should be used by the caller to submit a DAG execution. The + output channel(s) should be read by the caller to get the DAG + output. 
+ """ from ray.dag import DAGNode, InputNode, OutputNode, ClassMethodNode if self.input_task_idx is None: From 4dfa31eda5346b74d5e7b1e774eab2a193e1d148 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 7 Dec 2023 10:52:06 -0800 Subject: [PATCH 48/66] skip tests on windows Signed-off-by: Stephanie Wang --- python/ray/tests/test_channel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/ray/tests/test_channel.py b/python/ray/tests/test_channel.py index 0bd008593a110..3ff33df76bc5d 100644 --- a/python/ray/tests/test_channel.py +++ b/python/ray/tests/test_channel.py @@ -13,6 +13,7 @@ logger = logging.getLogger(__name__) +@pytest.mark.skipif(sys.platform == "win32", reason="Requires POSIX.") def test_put_local_get(ray_start_regular): chan = ray_channel.Channel(1000) @@ -28,6 +29,7 @@ def test_put_local_get(ray_start_regular): chan.end_read() +@pytest.mark.skipif(sys.platform == "win32", reason="Requires POSIX.") def test_errors(ray_start_regular): @ray.remote class Actor: @@ -72,6 +74,7 @@ def read(self, chan): assert "ray.exceptions.RaySystemError" in str(exc_info.value) +@pytest.mark.skipif(sys.platform == "win32", reason="Requires POSIX.") def test_put_different_meta(ray_start_regular): chan = ray_channel.Channel(1000) @@ -97,6 +100,7 @@ def _test(val): _test(np.random.rand(1)) +@pytest.mark.skipif(sys.platform == "win32", reason="Requires POSIX.") @pytest.mark.parametrize("num_readers", [1, 4]) def test_put_remote_get(ray_start_regular, num_readers): chan = ray_channel.Channel(1000) From 03f4fbd1a4e4b156b384441ce39e07f87a43c69e Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 8 Dec 2023 09:24:25 -0800 Subject: [PATCH 49/66] larger CI machine Signed-off-by: Stephanie Wang --- .buildkite/core.rayci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/core.rayci.yml b/.buildkite/core.rayci.yml index d8ddef502e1f7..c4beb251b10c1 100644 --- a/.buildkite/core.rayci.yml +++ b/.buildkite/core.rayci.yml @@ -210,7 +210,7 @@ steps: - label: ":ray: core: cpp ubsan tests" tags: core_cpp - instance_type: medium + instance_type: large commands: - bazel run //ci/ray_ci:test_in_docker -- //:all //src/... 
core --build-type ubsan --except-tags no_ubsan From 7dde158bffc5bbdee778837d9277692e0284d189 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 8 Dec 2023 17:35:00 -0800 Subject: [PATCH 50/66] cleanup Signed-off-by: Stephanie Wang --- python/ray/dag/class_node.py | 9 +- python/ray/dag/compiled_dag_node.py | 119 ++++++++++++++----- python/ray/dag/dag_node.py | 2 +- python/ray/dag/tests/test_accelerated_dag.py | 43 +++++++ python/ray/experimental/channel.py | 33 +++-- 5 files changed, 159 insertions(+), 47 deletions(-) diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index 8daf406dd2e1c..9e0e1c3596a22 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -7,7 +7,7 @@ from ray.dag.constants import PARENT_CLASS_NODE_KEY from ray.util.annotations import DeveloperAPI -from typing import Any, Dict, List, Union, Tuple +from typing import Any, Dict, List, Optional, Union, Tuple @DeveloperAPI @@ -146,7 +146,7 @@ def __init__( self._method_name: str = method_name # Parse other_args_to_resolve and assign to variables self._parent_class_node: Union[ - ClassNode, ReferenceType["ray._private.actor.ActorHandle"] + ClassNode, ReferenceType["ray.actor.ActorHandle"] ] = other_args_to_resolve.get(PARENT_CLASS_NODE_KEY) # The actor creation task dependency is encoded as the first argument, # and the ordering dependency as the second, which ensures they are @@ -197,3 +197,8 @@ def get_method_name(self) -> str: def _get_remote_method(self, method_name): method_body = getattr(self._parent_class_node, method_name) return method_body + + def _get_actor_handle(self) -> Optional[ReferenceType["ray.actor.ActorHandle"]]: + if not isinstance(self._parent_class_node, ray.actor.ActorHandle): + return None + return self._parent_class_node diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 35890b3367939..17f6362b6bfd7 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -1,5 +1,8 @@ +import logging from typing import Any, List, Tuple, Union +from collections import defaultdict + import ray import ray.experimental.channel as ray_channel @@ -8,70 +11,93 @@ ChannelType = "ray.experimental.channel.Channel" - -def allocate_channel(buffer_size_bytes: int = MAX_BUFFER_SIZE, num_readers: int = 1): - if not isinstance(buffer_size_bytes, int): - raise ValueError("buffer_size_bytes must be an integer") - if not isinstance(num_readers, int): - raise ValueError("num_readers must be an integer") - - return ray_channel.Channel(buffer_size_bytes, num_readers) +logger = logging.getLogger(__name__) def do_allocate_channel( self, buffer_size_bytes: int = MAX_BUFFER_SIZE, num_readers: int = 1 -): - self._output_channel = allocate_channel(buffer_size_bytes) +) -> ChannelType: + """Generic actor method to allocate an output channel. + + Args: + buffer_size_bytes: The maximum size of messages in the channel. + num_readers: The number of readers per message. + + Returns: + The allocated channel. + """ + self._output_channel = ray_channel.Channel(buffer_size_bytes, num_readers) return self._output_channel def do_exec_compiled_task( self, - inputs: List[Union[Any, "ray_channel.Channel"]], + inputs: List[Union[Any, ChannelType]], actor_method_name: str, -): +) -> None: + """Generic actor method to begin executing a compiled DAG. This runs an + infinite loop to repeatedly read input channel(s), execute the given + method, and write output channel(s). It only exits if the actor dies or an + exception is thrown. 
+ + Args: + inputs: The arguments to the task. Arguments that are not Channels will + get passed through to the actor method. If the argument is a channel, + it will be replaced by the value read from the channel before the + method execute. + actor_method_name: The name of the actual actor method to execute in + the loop. + """ try: method = getattr(self, actor_method_name) resolved_inputs = [] input_channel_idxs = [] # Add placeholders for input channels. - for inp in inputs: + for idx, inp in enumerate(inputs): if isinstance(inp, ray_channel.Channel): - input_channel_idxs.append((len(resolved_inputs), inp)) + input_channel_idxs.append((idx, inp)) resolved_inputs.append(None) else: resolved_inputs.append(inp) while True: - for idx, chan in input_channel_idxs: - resolved_inputs[idx] = chan.begin_read() + for idx, channel in input_channel_idxs: + resolved_inputs[idx] = channel.begin_read() output_val = method(*resolved_inputs) self._output_channel.write(output_val) - for _, chan in input_channel_idxs: - chan.end_read() + for _, channel in input_channel_idxs: + channel.end_read() except Exception as e: - print("Task aborted", e) + logging.warn(f"Compiled DAG task aborted with exception: {e}") raise class CompiledTask: """Wraps the normal Ray DAGNode with some metadata.""" - def __init__(self, idx, dag_node: "ray.dag.DAGNode"): + def __init__(self, idx: int, dag_node: "ray.dag.DAGNode"): + """ + Args: + idx: A unique index into the original DAG. + dag_node: The original DAG node created by the user. + """ self.idx = idx self.dag_node = dag_node - self.args = [] - self.dependent_node_idxs = set() + self.downstream_node_idxs = set() self.output_channel = None + @property + def args(self): + return self.dag_node.get_args() + @property def num_readers(self): - return len(self.dependent_node_idxs) + return len(self.downstream_node_idxs) def __str__(self): return f""" @@ -95,23 +121,27 @@ def __init__(self): self.input_task_idx = None self.output_task_idx = None self.has_single_output = False + self.actor_task_count = defaultdict(int) # Cached attributes that are set during compilation. self.dag_input_channel = None self.dag_output_channels = None - self.node_idx_to_output_channels = {} + # ObjectRef for each worker's task. The task is an infinite loop that + # repeatedly executes the method specified in the DAG. self.worker_task_refs = [] - def _add_node(self, node): + def _add_node(self, node: "ray.dag.DAGNode") -> None: idx = self.counter self.idx_to_task[idx] = CompiledTask(idx, node) self.dag_node_to_idx[node] = idx self.counter += 1 - def _preprocess(self): + def _preprocess(self) -> None: """Before compiling, preprocess the DAG to build an index from task to upstream and downstream tasks, and to set the input and output node(s) of the DAG. + + This function is idempotent. """ from ray.dag import ( DAGNode, @@ -122,6 +152,9 @@ def _preprocess(self): OutputNode, ) + self.input_task_idx, self.output_task_idx = None, None + self.actor_task_count.clear() + # For each task node, set its upstream and downstream task nodes. 
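+        # Illustrative example: for a chain DAG  inp -> a.inc -> b.inc, after this
+        # pass the task for `inp` has downstream_node_idxs == {a.inc}, `a.inc` has
+        # {b.inc}, and `b.inc` has an empty set, which marks it as the DAG's output
+        # task below.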
for idx, task in self.idx_to_task.items(): dag_node = task.dag_node @@ -146,11 +179,26 @@ def _preprocess(self): f"Found unsupported node of type {type(task.dag_node)}" ) - task.args = task.dag_node.get_args() + if isinstance(dag_node, ClassMethodNode): + actor_handle = dag_node._get_actor_handle() + if actor_handle is None: + raise ValueError( + "Compiled DAGs can only bind methods to an actor " + "that is already created with Actor.remote()" + ) + self.actor_task_count[actor_handle._actor_id] += 1 + for arg in task.args: if isinstance(arg, DAGNode): arg_idx = self.dag_node_to_idx[arg] - self.idx_to_task[arg_idx].dependent_node_idxs.add(idx) + self.idx_to_task[arg_idx].downstream_node_idxs.add(idx) + + for actor_id, task_count in self.actor_task_count.items(): + if task_count > 1: + raise ValueError( + "Compiled DAGs can contain at most one task per actor handle. " + f"Actor with ID {actor_id} appears {task_count}x." + ) # Find the input node to the DAG. for idx, task in self.idx_to_task.items(): @@ -163,7 +211,7 @@ def _preprocess(self): # Find the (multi-)output node to the DAG. for idx, task in self.idx_to_task.items(): - if len(task.dependent_node_idxs) == 0: + if len(task.downstream_node_idxs) == 0: assert self.output_task_idx is None, "More than one output node found" self.output_task_idx = idx @@ -177,7 +225,6 @@ def _preprocess(self): self.output_task_idx = self.dag_node_to_idx[output_node] # Preprocess one more time so that we have the right output node # now. - self.input_task_idx, self.output_task_idx = None, None self._preprocess() def _compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: @@ -186,6 +233,9 @@ def _compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: actor in the DAG that repeatedly receives from input channel(s) and sends to output channel(s). + This function is idempotent and will cache the previously allocated + channels. + Returns: A tuple of (input channel, output channel(s)). The input channel that should be used by the caller to submit a DAG execution. The @@ -197,7 +247,8 @@ def _compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: if self.input_task_idx is None: self._preprocess() - if self.dag_input_channel is not None and self.dag_output_channels is not None: + if self.dag_input_channel is not None: + assert self.dag_output_channels is not None # Driver should ray.put on input, ray.get/release on output return ( self.dag_input_channel, @@ -225,11 +276,13 @@ def _compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: ) ) elif isinstance(task.dag_node, InputNode): - task.output_channel = allocate_channel(num_readers=task.num_readers) + task.output_channel = ray_channel.Channel( + buffer_size_bytes=MAX_BUFFER_SIZE, num_readers=task.num_readers + ) else: assert isinstance(task.dag_node, OutputNode) - for idx in task.dependent_node_idxs: + for idx in task.downstream_node_idxs: queue.append(idx) for node_idx, task in self.idx_to_task.items(): diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 64a3fd1f1db31..fa029457fa73c 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -234,7 +234,7 @@ def _apply_and_replace_all_child_nodes( new_args, new_kwargs, self.get_options(), new_other_args_to_resolve ) - def apply_recursive(self, fn: "Callable[[DAGNode], T]", cache=None) -> T: + def apply_recursive(self, fn: "Callable[[DAGNode], T]") -> T: """Apply callable on each node in this DAG in a bottom-up tree walk. 
Args: diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 4702ee45580f1..49aecd182be7f 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -9,6 +9,7 @@ import ray.cluster_utils from ray.dag import InputNode, OutputNode from ray.tests.conftest import * # noqa +from ray._private.test_utils import wait_for_condition logger = logging.getLogger(__name__) @@ -24,6 +25,10 @@ def inc(self, x): self.i += x return self.i + def append_to(self, lst): + lst.append(self.i) + return lst + def inc_two(self, x, y): self.i += x self.i += y @@ -79,6 +84,36 @@ def test_scatter_gather_dag(ray_start_regular, num_actors): chan.end_read() +@pytest.mark.parametrize("num_actors", [1, 4]) +def test_chain_dag(ray_start_regular, num_actors): + actors = [Actor.remote(i) for i in range(num_actors)] + with InputNode() as inp: + dag = inp + for a in actors: + dag = a.append_to.bind(dag) + + compiled_dag = dag.experimental_compile() + + for i in range(3): + output_channel = compiled_dag.execute([]) + # TODO(swang): Replace with fake ObjectRef. + result = output_channel.begin_read() + assert result == list(range(num_actors)) + output_channel.end_read() + + +def test_dag_exception(ray_start_regular, capsys): + a = Actor.remote(0) + with InputNode() as inp: + dag = a.inc.bind(inp) + + compiled_dag = dag.experimental_compile() + output_channel = compiled_dag.execute("hello") + wait_for_condition( + lambda: "Compiled DAG task aborted with exception" in capsys.readouterr().err + ) + + def test_dag_errors(ray_start_regular): a = Actor.remote(0) dag = a.inc.bind(1) @@ -108,6 +143,14 @@ def f(x): ): dag.experimental_compile() + with InputNode() as inp: + dag = a.inc.bind(inp) + dag = a.inc.bind(dag) + with pytest.raises( + ValueError, match="Compiled DAGs can contain at most one task per actor handle." + ): + dag.experimental_compile() + with InputNode() as inp: dag = a.inc_two.bind(inp[0], inp[1]) with pytest.raises( diff --git a/python/ray/experimental/channel.py b/python/ray/experimental/channel.py index e8ef9ad085f79..310fa870c0fd1 100644 --- a/python/ray/experimental/channel.py +++ b/python/ray/experimental/channel.py @@ -12,7 +12,7 @@ def _create_channel_ref( - buffer_size: int, + buffer_size_bytes: int, ) -> "ray.ObjectRef": """ Create a channel that can be read and written by co-located Ray processes. @@ -21,7 +21,7 @@ def _create_channel_ref( read the previous value. Only the channel creator may write to the channel. Args: - buffer_size: The number of bytes to allocate for the object data and + buffer_size_bytes: The number of bytes to allocate for the object data and metadata. Writes to the channel must produce serialized data and metadata less than or equal to this value. Returns: @@ -30,7 +30,7 @@ def _create_channel_ref( worker = ray._private.worker.global_worker worker.check_connected() - value = b"0" * buffer_size + value = b"0" * buffer_size_bytes try: object_ref = worker.put_object( @@ -52,7 +52,12 @@ class Channel: ray.wait. """ - def __init__(self, buffer_size: Optional[int] = None, num_readers: int = 1): + def __init__( + self, + buffer_size_bytes: Optional[int] = None, + num_readers: int = 1, + _base_ref: Optional["ray.ObjectRef"] = None, + ): """ Create a channel that can be read and written by co-located Ray processes. @@ -60,26 +65,32 @@ def __init__(self, buffer_size: Optional[int] = None, num_readers: int = 1): so the writer will block until reader(s) have read the previous value. 
Args: - buffer_size: The number of bytes to allocate for the object data and + buffer_size_bytes: The number of bytes to allocate for the object data and metadata. Writes to the channel must produce serialized data and metadata less than or equal to this value. Returns: Channel: A wrapper around ray.ObjectRef. """ - if buffer_size is None: - self._base_ref = None + if buffer_size_bytes is None: + if _base_ref is None: + raise ValueError( + "One of `buffer_size_bytes` or `_base_ref` must be provided" + ) + self._base_ref = _base_ref else: - self._base_ref = _create_channel_ref(buffer_size) + if not isinstance(buffer_size_bytes, int): + raise ValueError("buffer_size_bytes must be an integer") + self._base_ref = _create_channel_ref(buffer_size_bytes) + if not isinstance(num_readers, int): + raise ValueError("num_readers must be an integer") self._num_readers = num_readers self._worker = ray._private.worker.global_worker self._worker.check_connected() @staticmethod def _from_base_ref(base_ref: "ray.ObjectRef", num_readers: int) -> "Channel": - chan = Channel(num_readers=num_readers) - chan._base_ref = base_ref - return chan + return Channel(num_readers=num_readers, _base_ref=base_ref) def __reduce__(self): return self._from_base_ref, (self._base_ref, self._num_readers) From 63cc16d55f2b37ba4e13f67a68d85bcbf7f2c1d2 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 8 Dec 2023 17:35:11 -0800 Subject: [PATCH 51/66] cleanup Signed-off-by: Stephanie Wang --- python/ray/dag/tests/test_accelerated_dag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 49aecd182be7f..4435235b52d05 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -108,7 +108,7 @@ def test_dag_exception(ray_start_regular, capsys): dag = a.inc.bind(inp) compiled_dag = dag.experimental_compile() - output_channel = compiled_dag.execute("hello") + compiled_dag.execute("hello") wait_for_condition( lambda: "Compiled DAG task aborted with exception" in capsys.readouterr().err ) From dca12399b057c01a4fb25307a2b43d401ba0fc7e Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 8 Dec 2023 17:50:31 -0800 Subject: [PATCH 52/66] perf Signed-off-by: Stephanie Wang --- python/ray/_private/ray_perf.py | 46 +++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py index 330527957d675..99401241ceb22 100644 --- a/python/ray/_private/ray_perf.py +++ b/python/ray/_private/ray_perf.py @@ -9,6 +9,7 @@ import ray import ray.experimental.channel as ray_channel +from ray.dag import InputNode, OutputNode logger = logging.getLogger(__name__) @@ -369,6 +370,51 @@ def read(self, chans): for reader in readers: ray.kill(reader) + # Tests for compiled DAGs. 
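+    # A rough sketch (illustrative only) of the pattern benchmarked below: each
+    # execute() call writes its argument to the compiled DAG's input channel and
+    # returns the output channel(s); the caller must begin_read()/end_read() each
+    # output channel to release the buffer before the next execute(). For example:
+    #
+    #     with InputNode() as inp:
+    #         dag = a.echo.bind(inp)
+    #     compiled = dag.experimental_compile()
+    #     chan = compiled.execute(b"x")
+    #     assert chan.begin_read() == b"x"
+    #     chan.end_read()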
+ + def _exec(dag): + output_channel = dag.execute(b"x") + output_channel.begin_read() + output_channel.end_read() + + def _exec_multi_output(dag): + output_channels = dag.execute(b"x") + for output_channel in output_channels: + output_channel.begin_read() + for output_channel in output_channels: + output_channel.end_read() + + @ray.remote + class Actor: + def echo(self, x): + return x + + a = Actor.remote() + with InputNode() as inp: + dag = a.echo.bind(inp) + + dag = dag.experimental_compile() + results += timeit("compiled single-actor DAG calls", lambda: _exec(dag)) + + del a + n_cpu = multiprocessing.cpu_count() // 2 + actors = [Actor.remote() for _ in range(n_cpu)] + with InputNode() as inp: + dag = OutputNode([a.echo.bind(inp) for a in actors]) + dag = dag.experimental_compile() + results += timeit( + f"compiled scatter-gather DAG calls, n={n_cpu} actors", + lambda: _exec_multi_output(dag), + ) + + actors = [Actor.remote() for _ in range(n_cpu)] + with InputNode() as inp: + dag = inp + for a in actors: + dag = a.echo.bind(dag) + dag = dag.experimental_compile() + results += timeit(f"compiled chain DAG calls, n={n_cpu} actors", lambda: _exec(dag)) + ray.shutdown() ############################ From 7b8472b70438a0d1c45b8a47817ef0afc50ec45f Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Sat, 9 Dec 2023 11:28:42 -0800 Subject: [PATCH 53/66] add normal DAG Signed-off-by: Stephanie Wang --- python/ray/_private/ray_perf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py index 99401241ceb22..73d1f2afb0acd 100644 --- a/python/ray/_private/ray_perf.py +++ b/python/ray/_private/ray_perf.py @@ -393,6 +393,7 @@ def echo(self, x): with InputNode() as inp: dag = a.echo.bind(inp) + results += timeit("single-actor DAG calls", lambda: ray.get(dag.execute(b"x"))) dag = dag.experimental_compile() results += timeit("compiled single-actor DAG calls", lambda: _exec(dag)) @@ -401,6 +402,7 @@ def echo(self, x): actors = [Actor.remote() for _ in range(n_cpu)] with InputNode() as inp: dag = OutputNode([a.echo.bind(inp) for a in actors]) + results += timeit("scatter-gather DAG calls", lambda: ray.get(dag.execute(b"x"))) dag = dag.experimental_compile() results += timeit( f"compiled scatter-gather DAG calls, n={n_cpu} actors", @@ -412,6 +414,9 @@ def echo(self, x): dag = inp for a in actors: dag = a.echo.bind(dag) + results += timeit( + f"chain DAG calls, n={n_cpu} actors", lambda: ray.get(dag.execute(b"x")) + ) dag = dag.experimental_compile() results += timeit(f"compiled chain DAG calls, n={n_cpu} actors", lambda: _exec(dag)) From 740169bf761576be79bc412bed621615eea622ef Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Sat, 9 Dec 2023 11:39:05 -0800 Subject: [PATCH 54/66] x Signed-off-by: Stephanie Wang --- python/ray/_private/ray_perf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py index 73d1f2afb0acd..8d1a56f09cc98 100644 --- a/python/ray/_private/ray_perf.py +++ b/python/ray/_private/ray_perf.py @@ -402,7 +402,9 @@ def echo(self, x): actors = [Actor.remote() for _ in range(n_cpu)] with InputNode() as inp: dag = OutputNode([a.echo.bind(inp) for a in actors]) - results += timeit("scatter-gather DAG calls", lambda: ray.get(dag.execute(b"x"))) + results += timeit( + "scatter-gather DAG calls, n={n_cpu} actors", lambda: ray.get(dag.execute(b"x")) + ) dag = dag.experimental_compile() results += timeit( f"compiled scatter-gather DAG calls, 
n={n_cpu} actors", From 905a5bc4f01c62c8ededaa81abfe4318b1da0653 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 10:29:57 -0800 Subject: [PATCH 55/66] merge Signed-off-by: Stephanie Wang --- python/ray/_private/ray_perf.py | 4 +- python/ray/dag/constants.py | 1 + python/ray/dag/dag_node.py | 7 -- python/ray/dag/tests/test_accelerated_dag.py | 8 +- python/ray/dag/tests/test_accelerator_dag.py | 122 ------------------- 5 files changed, 7 insertions(+), 135 deletions(-) delete mode 100644 python/ray/dag/tests/test_accelerator_dag.py diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py index 8d1a56f09cc98..d1c1e7e1abf6c 100644 --- a/python/ray/_private/ray_perf.py +++ b/python/ray/_private/ray_perf.py @@ -9,7 +9,7 @@ import ray import ray.experimental.channel as ray_channel -from ray.dag import InputNode, OutputNode +from ray.dag import InputNode, MultiOutputNode logger = logging.getLogger(__name__) @@ -401,7 +401,7 @@ def echo(self, x): n_cpu = multiprocessing.cpu_count() // 2 actors = [Actor.remote() for _ in range(n_cpu)] with InputNode() as inp: - dag = OutputNode([a.echo.bind(inp) for a in actors]) + dag = MultiOutputNode([a.echo.bind(inp) for a in actors]) results += timeit( "scatter-gather DAG calls, n={n_cpu} actors", lambda: ray.get(dag.execute(b"x")) ) diff --git a/python/ray/dag/constants.py b/python/ray/dag/constants.py index 77ccb6cc35b78..d2d309d56bdaa 100644 --- a/python/ray/dag/constants.py +++ b/python/ray/dag/constants.py @@ -1,5 +1,6 @@ # Reserved keys used to handle ClassMethodNode in Ray DAG building. PARENT_CLASS_NODE_KEY = "parent_class_node" +PREV_CLASS_METHOD_CALL_KEY = "prev_class_method_call" # Reserved key to distinguish DAGNode type and avoid collision with user dict. DAGNODE_TYPE_KEY = "__dag_node_type__" diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 81dbd2b3d128f..6d15889eea8c2 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -117,14 +117,7 @@ def experimental_compile(self) -> "ray.dag.CompiledDAG": return self._compiled_dag def execute( -<<<<<<< HEAD - self, - *args, - _ray_cache_refs: bool = False, - **kwargs, -======= self, *args, _ray_cache_refs: bool = False, **kwargs ->>>>>>> 1a090a0f13492fbaa0561514488bf9b3638af6af ) -> Union[ray.ObjectRef, "ray.actor.ActorHandle"]: """Execute this DAG using the Ray default executor _execute_impl(). 
diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 4435235b52d05..13a90ceef3b13 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -7,7 +7,7 @@ import ray import ray.cluster_utils -from ray.dag import InputNode, OutputNode +from ray.dag import InputNode, MultiOutputNode from ray.tests.conftest import * # noqa from ray._private.test_utils import wait_for_condition @@ -35,7 +35,7 @@ def inc_two(self, x, y): return self.i -def test_single_output_dag(ray_start_regular): +def test_basic(ray_start_regular): a = Actor.remote(0) with InputNode() as i: dag = a.inc.bind(i) @@ -71,7 +71,7 @@ def test_scatter_gather_dag(ray_start_regular, num_actors): actors = [Actor.remote(0) for _ in range(num_actors)] with InputNode() as i: out = [a.inc.bind(i) for a in actors] - dag = OutputNode(out) + dag = MultiOutputNode(out) compiled_dag = dag.experimental_compile() @@ -124,7 +124,7 @@ def test_dag_errors(ray_start_regular): a2 = Actor.remote(0) with InputNode() as inp: - dag = OutputNode([a.inc.bind(inp), a2.inc.bind(1)]) + dag = MultiOutputNode([a.inc.bind(inp), a2.inc.bind(1)]) with pytest.raises( ValueError, match="Compiled DAGs require each task to take a ray.dag.InputNode or " diff --git a/python/ray/dag/tests/test_accelerator_dag.py b/python/ray/dag/tests/test_accelerator_dag.py deleted file mode 100644 index 7114a4f0f0ac7..0000000000000 --- a/python/ray/dag/tests/test_accelerator_dag.py +++ /dev/null @@ -1,122 +0,0 @@ -import pytest - -import ray -from ray.dag.input_node import InputNode -from ray.dag.output_node import OutputNode - - -def test_output_node(shared_ray_instance): - @ray.remote - def f(input): - return input - - with pytest.raises(ValueError): - with InputNode() as input_data: - dag = OutputNode(f.bind(input_data)) - - with InputNode() as input_data: - dag = OutputNode([f.bind(input_data)]) - - assert ray.get(dag.execute(1)) == [1] - assert ray.get(dag.execute(2)) == [2] - - with InputNode() as input_data: - dag = OutputNode([f.bind(input_data["x"]), f.bind(input_data["y"])]) - - refs = dag.execute({"x": 1, "y": 2}) - assert len(refs) == 2 - assert ray.get(refs) == [1, 2] - - with InputNode() as input_data: - dag = OutputNode( - [f.bind(input_data["x"]), f.bind(input_data["y"]), f.bind(input_data["x"])] - ) - - refs = dag.execute({"x": 1, "y": 2}) - assert len(refs) == 3 - assert ray.get(refs) == [1, 2, 1] - - -def test_dag_with_actor_handle(shared_ray_instance): - """Verify DAG API works with actor created by .remote""" - - @ray.remote - class Worker: - def __init__(self): - self.forward_called = 0 - self.init_called = 0 - - def forward(self, input): - print("forward") - self.forward_called += 1 - return input - - def initialize(self, input): - print("initialize") - self.init_called += 1 - return input - - def get(self): - return (self.forward_called, self.init_called) - - worker = Worker.remote() - with InputNode() as input_node: - init_dag = worker.initialize.bind(input_node) - with InputNode() as input_node: - forward_dag = worker.forward.bind(input_node) - - assert ray.get(init_dag.execute(1)) == 1 - assert ray.get(forward_dag.execute(2)) == 2 - - # Make sure both forward/initialize called only once - assert ray.get(worker.get.remote()) == (1, 1) - - # Double check the actor is resued. 
- assert ray.get(init_dag.execute(1)) == 1 - assert ray.get(worker.get.remote()) == (1, 2) - - -def test_tensor_parallel_dag(shared_ray_instance): - @ray.remote - class Worker: - def __init__(self, rank): - self.rank = rank - self.forwarded = 0 - - def forward(self, input_data: int): - print(input_data) - self.forwarded += 1 - return self.rank + input_data - - def initialize(self): - pass - - def get_forwarded(self): - return self.forwarded - - NUM_WORKERS = 4 - workers = [Worker.remote(i) for i in range(NUM_WORKERS)] - # Init multiple times. - for _ in range(4): - ray.get([worker.initialize.remote() for worker in workers]) - - with InputNode() as input_data: - dag = OutputNode([worker.forward.bind(input_data) for worker in workers]) - - # Run DAG repetitively. - ITER = 4 - assert ITER > 1 - for i in range(ITER): - ref = dag.execute(i) - all_outputs = ray.get(ref) - assert len(all_outputs) == NUM_WORKERS - assert all_outputs == [i + j for j in range(NUM_WORKERS)] - - forwarded = ray.get([worker.get_forwarded.remote() for worker in workers]) - assert forwarded == [ITER for _ in range(NUM_WORKERS)] - - -if __name__ == "__main__": - import sys - - sys.exit(pytest.main(["-v", __file__])) From 4436b1f55fe5d34b41e0c46f4045de195b5add0f Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 10:33:35 -0800 Subject: [PATCH 56/66] revert Signed-off-by: Stephanie Wang --- python/ray/dag/class_node.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index 6c07bc2ace1b3..e4a74f83c7495 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -23,6 +23,7 @@ def __init__( other_args_to_resolve=None, ): self._body = cls + self._last_call: Optional["ClassMethodNode"] = None super().__init__( cls_args, cls_kwargs, @@ -102,6 +103,7 @@ def __init__(self, actor: ClassNode, method_name: str, options: dict): def bind(self, *args, **kwargs): other_args_to_resolve = { PARENT_CLASS_NODE_KEY: self._actor, + PREV_CLASS_METHOD_CALL_KEY: self._actor._last_call, } node = ClassMethodNode( @@ -111,6 +113,7 @@ def bind(self, *args, **kwargs): self._options, other_args_to_resolve=other_args_to_resolve, ) + self._actor._last_call = node return node def __getattr__(self, attr: str): @@ -199,12 +202,3 @@ def __str__(self) -> str: def get_method_name(self) -> str: return self._method_name - - def _get_remote_method(self, method_name): - method_body = getattr(self._parent_class_node, method_name) - return method_body - - def _get_actor_handle(self) -> Optional[ReferenceType["ray.actor.ActorHandle"]]: - if not isinstance(self._parent_class_node, ray.actor.ActorHandle): - return None - return self._parent_class_node From f105ed517d9e70cf75db8c4ea47806655a70ca4a Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 10:34:02 -0800 Subject: [PATCH 57/66] revert Signed-off-by: Stephanie Wang --- python/ray/dag/tests/test_class_dag.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/python/ray/dag/tests/test_class_dag.py b/python/ray/dag/tests/test_class_dag.py index 2c94b501d006e..61e09ef85a6a4 100644 --- a/python/ray/dag/tests/test_class_dag.py +++ b/python/ray/dag/tests/test_class_dag.py @@ -147,6 +147,13 @@ def combine(x, y): .get("name") == "a2_v0" ) + # refer to actor method a2.inc.options() call + assert ( + test_a2.get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] + .get_options() + .get("name") + == "v3" + ) # refer to a1 constructor .options() call assert ( 
test_a1.get_other_args_to_resolve()[PARENT_CLASS_NODE_KEY] @@ -154,6 +161,21 @@ def combine(x, y): .get("name") == "a1_v1" ) + # refer to latest actor method a1.inc.options() call + assert ( + test_a1.get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] + .get_options() + .get("name") + == "v2" + ) + # refer to first bound actor method a1.inc.options() call + assert ( + test_a1.get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] + .get_other_args_to_resolve()[PREV_CLASS_METHOD_CALL_KEY] + .get_options() + .get("name") + == "v1" + ) def test_pass_actor_handle(shared_ray_instance): From 00f3f1cdb71007230d517dee786d20b3e5f2538a Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 10:43:37 -0800 Subject: [PATCH 58/66] x Signed-off-by: Stephanie Wang --- python/ray/actor.py | 5 ++-- python/ray/dag/__init__.py | 2 ++ python/ray/dag/class_node.py | 9 ++++++ python/ray/dag/compiled_dag_node.py | 31 ++++++++++++-------- python/ray/dag/dag_node.py | 7 +---- python/ray/dag/tests/test_accelerated_dag.py | 2 +- 6 files changed, 33 insertions(+), 23 deletions(-) diff --git a/python/ray/actor.py b/python/ray/actor.py index e901ed414c356..4560d72c03de9 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -29,7 +29,6 @@ PythonFunctionDescriptor, raise_sys_exit_with_custom_error_message, ) -from ray.dag.class_node import PARENT_CLASS_NODE_KEY, ClassMethodNode from ray.exceptions import AsyncioActorExit from ray.util.annotations import DeveloperAPI, PublicAPI from ray.util.placement_group import _configure_placement_group_based_on_context @@ -152,7 +151,7 @@ def __init__( decorator=None, hardref=False, ): - self._actor_ref = weakref.proxy(actor) + self._actor_ref = weakref.ref(actor) self._method_name = method_name self._num_returns = num_returns @@ -319,7 +318,7 @@ def invocation(args, kwargs): def __getstate__(self): return { - "actor": self._actor_ref, + "actor": self._actor_ref(), "method_name": self._method_name, "num_returns": self._num_returns, "max_retries": self._max_retries, diff --git a/python/ray/dag/__init__.py b/python/ray/dag/__init__.py index 726d5930b17d0..e74f57245c097 100644 --- a/python/ray/dag/__init__.py +++ b/python/ray/dag/__init__.py @@ -9,6 +9,7 @@ from ray.dag.output_node import MultiOutputNode from ray.dag.constants import ( PARENT_CLASS_NODE_KEY, + PREV_CLASS_METHOD_CALL_KEY, DAGNODE_TYPE_KEY, ) from ray.dag.vis_utils import plot @@ -22,6 +23,7 @@ "InputAttributeNode", "DAGInputData", "PARENT_CLASS_NODE_KEY", + "PREV_CLASS_METHOD_CALL_KEY", "DAGNODE_TYPE_KEY", "plot", "MultiOutputNode", diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index e4a74f83c7495..84a2f9dac1963 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -202,3 +202,12 @@ def __str__(self) -> str: def get_method_name(self) -> str: return self._method_name + + def _get_remote_method(self, method_name): + method_body = getattr(self._parent_class_node, method_name) + return method_body + + def _get_actor_handle(self) -> Optional[ReferenceType["ray.actor.ActorHandle"]]: + if not isinstance(self._parent_class_node, ray.actor.ActorHandle): + return None + return self._parent_class_node diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 17f6362b6bfd7..dac78303f1ee3 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -92,14 +92,14 @@ def __init__(self, idx: int, dag_node: "ray.dag.DAGNode"): self.output_channel = None @property - def args(self): + def 
args(self) -> Tuple[Any]: return self.dag_node.get_args() @property - def num_readers(self): + def num_readers(self) -> int: return len(self.downstream_node_idxs) - def __str__(self): + def __str__(self) -> str: return f""" Node: {self.dag_node} Arguments: {self.args} @@ -108,6 +108,11 @@ def __str__(self): class CompiledDAG: + """Experimental class for accelerated execution. + + See REP https://github.com/ray-project/enhancements/pull/48 for more + information. + """ def __init__(self): # idx -> CompiledTask. self.idx_to_task = {} @@ -165,13 +170,13 @@ def _preprocess(self) -> None: ): if isinstance(dag_node, InputAttributeNode): # TODO(swang): Support multi args. - raise ValueError( + raise NotImplementedError( "Compiled DAGs currently do not support kwargs or " "multiple args for InputNode" ) elif isinstance(dag_node, FunctionNode): # TODO(swang): Support non-actor tasks. - raise ValueError( + raise NotImplementedError( "Compiled DAGs currently only support actor method nodes" ) else: @@ -195,7 +200,7 @@ def _preprocess(self) -> None: for actor_id, task_count in self.actor_task_count.items(): if task_count > 1: - raise ValueError( + raise NotImplementedError( "Compiled DAGs can contain at most one task per actor handle. " f"Actor with ID {actor_id} appears {task_count}x." ) @@ -207,7 +212,7 @@ def _preprocess(self) -> None: self.input_task_idx = idx # TODO: Support no-input DAGs (use an empty object to signal). if self.input_task_idx is None: - raise ValueError("Compiled DAGs currently require exactly one InputNode") + raise NotImplementedError("Compiled DAGs currently require exactly one InputNode") # Find the (multi-)output node to the DAG. for idx, task in self.idx_to_task.items(): @@ -227,7 +232,7 @@ def _preprocess(self) -> None: # now. self._preprocess() - def _compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: + def _get_or_compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: """Compile an execution path. This allocates channels for adjacent tasks to send/receive values. An infinite task is submitted to each actor in the DAG that repeatedly receives from input channel(s) and @@ -363,16 +368,16 @@ def execute( # These errors should already be caught during compilation, but just in # case. 
if len(args) != 1: - raise ValueError("Compiled DAGs support exactly one InputNode arg") + raise NotImplementedError("Compiled DAGs support exactly one InputNode arg") if len(kwargs) != 0: - raise ValueError("Compiled DAGs do not support kwargs") + raise NotImplementedError("Compiled DAGs do not support kwargs") - input_channel, output_channels = self._compile() + input_channel, output_channels = self._get_or_compile() input_channel.write(args[0]) return output_channels -def build_compiled_dag_from_ray_dag(dag: "ray.dag.DAGNode"): +def build_compiled_dag_from_ray_dag(dag: "ray.dag.DAGNode") -> "CompiledDAG": compiled_dag = CompiledDAG() def _build_compiled_dag(node): @@ -380,5 +385,5 @@ def _build_compiled_dag(node): return node dag.apply_recursive(_build_compiled_dag) - compiled_dag._compile() + compiled_dag._get_or_compile() return compiled_dag diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 6d15889eea8c2..7da478de68f32 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -61,8 +61,6 @@ def __init__( # Cached values from last call to execute() self.cache_from_last_execute = {} - self._compiled_dag = None - def get_args(self) -> Tuple[Any]: """Return the tuple of arguments for this node.""" @@ -111,10 +109,7 @@ def experimental_compile(self) -> "ray.dag.CompiledDAG": """Compile an accelerated execution path for this DAG. The compiled DAG is cached. """ - if self._compiled_dag is None: - self._compiled_dag = build_compiled_dag_from_ray_dag(self) - - return self._compiled_dag + return build_compiled_dag_from_ray_dag(self) def execute( self, *args, _ray_cache_refs: bool = False, **kwargs diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 13a90ceef3b13..0687607d93348 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -15,7 +15,7 @@ logger = logging.getLogger(__name__) -@ray.remote(concurrency_groups={"_ray_system": 1}) +@ray.remote class Actor: def __init__(self, init_value): print("__init__ PID", os.getpid()) From 257457d5ff09ebf8d01231ad6fe0c84ed0b8daa1 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 10:52:55 -0800 Subject: [PATCH 59/66] buffer size bytes Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 37 +++++++++++++++++------------ python/ray/dag/dag_node.py | 14 +++++++---- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index dac78303f1ee3..ebc3d6611f764 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -15,7 +15,7 @@ def do_allocate_channel( - self, buffer_size_bytes: int = MAX_BUFFER_SIZE, num_readers: int = 1 + self, buffer_size_bytes: int, num_readers: int = 1 ) -> ChannelType: """Generic actor method to allocate an output channel. @@ -113,27 +113,33 @@ class CompiledDAG: See REP https://github.com/ray-project/enhancements/pull/48 for more information. """ - def __init__(self): + def __init__(self, buffer_size_bytes: Optional[int]): + self._buffer_size_bytes : Optional[int] = buffer_size_bytes + if self._buffer_size_bytes is None: + self._buffer_size_bytes = MAX_BUFFER_SIZE + if not isinstance(self._buffer_size_bytes, int) or self._buffer_size_bytes <= 0: + raise ValueError(f"`buffer_size_bytes` must be a positive integer, found {self._buffer_size_bytes}") + # idx -> CompiledTask. 
- self.idx_to_task = {} + self.idx_to_task : Dict[int, "CompiledTask"] = {} # DAGNode -> idx. - self.dag_node_to_idx = {} + self.dag_node_to_idx : Dict["ray.dag.DAGNode", int] = {} # idx counter. - self.counter = 0 + self.counter : int = 0 # Attributes that are set during preprocessing. # Preprocessing identifies the input node and output node. - self.input_task_idx = None - self.output_task_idx = None - self.has_single_output = False - self.actor_task_count = defaultdict(int) + self.input_task_idx : Optional[int] = None + self.output_task_idx : Optional[int] = None + self.has_single_output : bool = False + self.actor_task_count : Dict["ray._raylet.ActorID", int] = defaultdict(int) # Cached attributes that are set during compilation. - self.dag_input_channel = None - self.dag_output_channels = None + self.dag_input_channel : Optional[ChannelType] = None + self.dag_output_channels : Optional[ChannelType] = None # ObjectRef for each worker's task. The task is an infinite loop that # repeatedly executes the method specified in the DAG. - self.worker_task_refs = [] + self.worker_task_refs : List["ray.ObjectRef"] = [] def _add_node(self, node: "ray.dag.DAGNode") -> None: idx = self.counter @@ -277,12 +283,13 @@ def _get_or_compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelT task.output_channel = ray.get( fn.remote( do_allocate_channel, + buffer_size_bytes=self._buffer_size_bytes, num_readers=task.num_readers, ) ) elif isinstance(task.dag_node, InputNode): task.output_channel = ray_channel.Channel( - buffer_size_bytes=MAX_BUFFER_SIZE, num_readers=task.num_readers + buffer_size_bytes=self._buffer_size_bytes, num_readers=task.num_readers ) else: assert isinstance(task.dag_node, OutputNode) @@ -377,8 +384,8 @@ def execute( return output_channels -def build_compiled_dag_from_ray_dag(dag: "ray.dag.DAGNode") -> "CompiledDAG": - compiled_dag = CompiledDAG() +def build_compiled_dag_from_ray_dag(dag: "ray.dag.DAGNode", buffer_size_bytes: Optional[int]) -> "CompiledDAG": + compiled_dag = CompiledDAG(buffer_size_bytes) def _build_compiled_dag(node): compiled_dag._add_node(node) diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 7da478de68f32..50136e6d855d5 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -105,11 +105,17 @@ async def get_object_refs_from_last_execute(self) -> Dict[str, Any]: def clear_cache(self): self.cache_from_last_execute = {} - def experimental_compile(self) -> "ray.dag.CompiledDAG": - """Compile an accelerated execution path for this DAG. The compiled DAG - is cached. + def experimental_compile(self, buffer_size_bytes: Optional[int] = None) -> "ray.dag.CompiledDAG": + """Compile an accelerated execution path for this DAG. + + Args: + buffer_size_bytes: The maximum size of messages that can be passed + between tasks in the DAG. + + Returns: + A compiled DAG. 
""" - return build_compiled_dag_from_ray_dag(self) + return build_compiled_dag_from_ray_dag(self, buffer_size_bytes) def execute( self, *args, _ray_cache_refs: bool = False, **kwargs From 71c32ae659e89d20de87a97e122d5e23bbad5d9a Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 19:14:26 -0800 Subject: [PATCH 60/66] optional Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index ebc3d6611f764..3abb813ebe1f3 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -1,5 +1,5 @@ import logging -from typing import Any, List, Tuple, Union +from typing import Any, List, Tuple, Union, Optional from collections import defaultdict From 2ba93f0f5041d853714d3beae92eb23c1b4c213f Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 21:20:49 -0800 Subject: [PATCH 61/66] x Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 61 +++++++++++++++++------------ python/ray/dag/dag_node.py | 4 +- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 3abb813ebe1f3..82d0a3901b546 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -1,5 +1,5 @@ import logging -from typing import Any, List, Tuple, Union, Optional +from typing import Any, Dict, List, Tuple, Union, Optional from collections import defaultdict @@ -113,33 +113,37 @@ class CompiledDAG: See REP https://github.com/ray-project/enhancements/pull/48 for more information. """ + def __init__(self, buffer_size_bytes: Optional[int]): - self._buffer_size_bytes : Optional[int] = buffer_size_bytes + self._buffer_size_bytes: Optional[int] = buffer_size_bytes if self._buffer_size_bytes is None: self._buffer_size_bytes = MAX_BUFFER_SIZE if not isinstance(self._buffer_size_bytes, int) or self._buffer_size_bytes <= 0: - raise ValueError(f"`buffer_size_bytes` must be a positive integer, found {self._buffer_size_bytes}") + raise ValueError( + "`buffer_size_bytes` must be a positive integer, found " + f"{self._buffer_size_bytes}" + ) # idx -> CompiledTask. - self.idx_to_task : Dict[int, "CompiledTask"] = {} + self.idx_to_task: Dict[int, "CompiledTask"] = {} # DAGNode -> idx. - self.dag_node_to_idx : Dict["ray.dag.DAGNode", int] = {} + self.dag_node_to_idx: Dict["ray.dag.DAGNode", int] = {} # idx counter. - self.counter : int = 0 + self.counter: int = 0 # Attributes that are set during preprocessing. # Preprocessing identifies the input node and output node. - self.input_task_idx : Optional[int] = None - self.output_task_idx : Optional[int] = None - self.has_single_output : bool = False - self.actor_task_count : Dict["ray._raylet.ActorID", int] = defaultdict(int) + self.input_task_idx: Optional[int] = None + self.output_task_idx: Optional[int] = None + self.has_single_output: bool = False + self.actor_task_count: Dict["ray._raylet.ActorID", int] = defaultdict(int) # Cached attributes that are set during compilation. - self.dag_input_channel : Optional[ChannelType] = None - self.dag_output_channels : Optional[ChannelType] = None + self.dag_input_channel: Optional[ChannelType] = None + self.dag_output_channels: Optional[ChannelType] = None # ObjectRef for each worker's task. The task is an infinite loop that # repeatedly executes the method specified in the DAG. 
- self.worker_task_refs : List["ray.ObjectRef"] = [] + self.worker_task_refs: List["ray.ObjectRef"] = [] def _add_node(self, node: "ray.dag.DAGNode") -> None: idx = self.counter @@ -160,7 +164,7 @@ def _preprocess(self) -> None: FunctionNode, InputAttributeNode, InputNode, - OutputNode, + MultiOutputNode, ) self.input_task_idx, self.output_task_idx = None, None @@ -171,7 +175,7 @@ def _preprocess(self) -> None: dag_node = task.dag_node if not ( isinstance(dag_node, InputNode) - or isinstance(dag_node, OutputNode) + or isinstance(dag_node, MultiOutputNode) or isinstance(dag_node, ClassMethodNode) ): if isinstance(dag_node, InputAttributeNode): @@ -218,7 +222,9 @@ def _preprocess(self) -> None: self.input_task_idx = idx # TODO: Support no-input DAGs (use an empty object to signal). if self.input_task_idx is None: - raise NotImplementedError("Compiled DAGs currently require exactly one InputNode") + raise NotImplementedError( + "Compiled DAGs currently require exactly one InputNode" + ) # Find the (multi-)output node to the DAG. for idx, task in self.idx_to_task.items(): @@ -228,17 +234,19 @@ def _preprocess(self) -> None: assert self.output_task_idx is not None output_node = self.idx_to_task[self.output_task_idx].dag_node - # Add an OutputNode to the end of the DAG if it's not already there. - if not isinstance(output_node, OutputNode): + # Add an MultiOutputNode to the end of the DAG if it's not already there. + if not isinstance(output_node, MultiOutputNode): self.has_single_output = True - output_node = OutputNode([output_node]) + output_node = MultiOutputNode([output_node]) self._add_node(output_node) self.output_task_idx = self.dag_node_to_idx[output_node] # Preprocess one more time so that we have the right output node # now. self._preprocess() - def _get_or_compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: + def _get_or_compile( + self, + ) -> Tuple[ChannelType, Union[ChannelType, List[ChannelType]]]: """Compile an execution path. This allocates channels for adjacent tasks to send/receive values. An infinite task is submitted to each actor in the DAG that repeatedly receives from input channel(s) and @@ -253,7 +261,7 @@ def _get_or_compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelT output channel(s) should be read by the caller to get the DAG output. """ - from ray.dag import DAGNode, InputNode, OutputNode, ClassMethodNode + from ray.dag import DAGNode, InputNode, MultiOutputNode, ClassMethodNode if self.input_task_idx is None: self._preprocess() @@ -289,10 +297,11 @@ def _get_or_compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelT ) elif isinstance(task.dag_node, InputNode): task.output_channel = ray_channel.Channel( - buffer_size_bytes=self._buffer_size_bytes, num_readers=task.num_readers + buffer_size_bytes=self._buffer_size_bytes, + num_readers=task.num_readers, ) else: - assert isinstance(task.dag_node, OutputNode) + assert isinstance(task.dag_node, MultiOutputNode) for idx in task.downstream_node_idxs: queue.append(idx) @@ -348,7 +357,7 @@ def _get_or_compile(self) -> Tuple[ChannelType, Union[ChannelType, List[ChannelT assert [ output_channel is not None for output_channel in self.dag_output_channels ] - # If no OutputNode was specified during the DAG creation, there is only + # If no MultiOutputNode was specified during the DAG creation, there is only # one output. Return a single output channel instead of a list of # channels. 
if self.has_single_output: @@ -384,7 +393,9 @@ def execute( return output_channels -def build_compiled_dag_from_ray_dag(dag: "ray.dag.DAGNode", buffer_size_bytes: Optional[int]) -> "CompiledDAG": +def build_compiled_dag_from_ray_dag( + dag: "ray.dag.DAGNode", buffer_size_bytes: Optional[int] +) -> "CompiledDAG": compiled_dag = CompiledDAG(buffer_size_bytes) def _build_compiled_dag(node): diff --git a/python/ray/dag/dag_node.py b/python/ray/dag/dag_node.py index 50136e6d855d5..5acfbd02f92cf 100644 --- a/python/ray/dag/dag_node.py +++ b/python/ray/dag/dag_node.py @@ -105,7 +105,9 @@ async def get_object_refs_from_last_execute(self) -> Dict[str, Any]: def clear_cache(self): self.cache_from_last_execute = {} - def experimental_compile(self, buffer_size_bytes: Optional[int] = None) -> "ray.dag.CompiledDAG": + def experimental_compile( + self, buffer_size_bytes: Optional[int] = None + ) -> "ray.dag.CompiledDAG": """Compile an accelerated execution path for this DAG. Args: From ac5fa5563fa604e46e84299fc6cc5dd9f6b31d77 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 12 Dec 2023 21:32:13 -0800 Subject: [PATCH 62/66] x Signed-off-by: Stephanie Wang --- python/ray/dag/tests/test_accelerated_dag.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/python/ray/dag/tests/test_accelerated_dag.py b/python/ray/dag/tests/test_accelerated_dag.py index 0687607d93348..b293ee8940267 100644 --- a/python/ray/dag/tests/test_accelerated_dag.py +++ b/python/ray/dag/tests/test_accelerated_dag.py @@ -14,6 +14,9 @@ logger = logging.getLogger(__name__) +if sys.platform != "linux": + pytest.skip("Skipping, requires Linux.", allow_module_level=True) + @ray.remote class Actor: @@ -118,7 +121,8 @@ def test_dag_errors(ray_start_regular): a = Actor.remote(0) dag = a.inc.bind(1) with pytest.raises( - ValueError, match="Compiled DAGs currently require exactly one InputNode" + NotImplementedError, + match="Compiled DAGs currently require exactly one InputNode", ): dag.experimental_compile() @@ -139,7 +143,8 @@ def f(x): with InputNode() as inp: dag = f.bind(inp) with pytest.raises( - ValueError, match="Compiled DAGs currently only support actor method nodes" + NotImplementedError, + match="Compiled DAGs currently only support actor method nodes", ): dag.experimental_compile() @@ -147,14 +152,15 @@ def f(x): dag = a.inc.bind(inp) dag = a.inc.bind(dag) with pytest.raises( - ValueError, match="Compiled DAGs can contain at most one task per actor handle." + NotImplementedError, + match="Compiled DAGs can contain at most one task per actor handle.", ): dag.experimental_compile() with InputNode() as inp: dag = a.inc_two.bind(inp[0], inp[1]) with pytest.raises( - ValueError, + NotImplementedError, match="Compiled DAGs currently do not support kwargs or multiple args " "for InputNode", ): @@ -163,7 +169,7 @@ def f(x): with InputNode() as inp: dag = a.inc_two.bind(inp.x, inp.y) with pytest.raises( - ValueError, + NotImplementedError, match="Compiled DAGs currently do not support kwargs or multiple args " "for InputNode", ): From 35a37fdb3cda3bdcdb48987b768f34cfb5fc6535 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 13 Dec 2023 10:43:57 -0800 Subject: [PATCH 63/66] lint? 
Signed-off-by: Stephanie Wang --- python/ray/dag/class_node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/dag/class_node.py b/python/ray/dag/class_node.py index 84a2f9dac1963..64f4615e1aada 100644 --- a/python/ray/dag/class_node.py +++ b/python/ray/dag/class_node.py @@ -207,7 +207,7 @@ def _get_remote_method(self, method_name): method_body = getattr(self._parent_class_node, method_name) return method_body - def _get_actor_handle(self) -> Optional[ReferenceType["ray.actor.ActorHandle"]]: + def _get_actor_handle(self) -> Optional["ray.actor.ActorHandle"]: if not isinstance(self._parent_class_node, ray.actor.ActorHandle): return None return self._parent_class_node From 13263318685aa43a9e1e4e91631b7233410d870f Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 13 Dec 2023 13:24:22 -0800 Subject: [PATCH 64/66] test Signed-off-by: Stephanie Wang --- python/ray/dag/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/dag/BUILD b/python/ray/dag/BUILD index 164791d0fcd2e..4cb5e5634a13b 100644 --- a/python/ray/dag/BUILD +++ b/python/ray/dag/BUILD @@ -71,7 +71,7 @@ py_test( py_test( name = "test_accelerated_dag", - size = "small", + size = "medium", srcs = dag_tests_srcs, tags = ["exclusive", "team:core", "ray_dag_tests"], deps = [":dag_lib"], From fadec0788ebdf4d91e774ee4b96875b8ad0a2a49 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 13 Dec 2023 13:48:19 -0800 Subject: [PATCH 65/66] API Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index 82d0a3901b546..bccf77389b9f1 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -14,6 +14,7 @@ logger = logging.getLogger(__name__) +@DeveloperAPI def do_allocate_channel( self, buffer_size_bytes: int, num_readers: int = 1 ) -> ChannelType: @@ -30,6 +31,7 @@ def do_allocate_channel( return self._output_channel +@DeveloperAPI def do_exec_compiled_task( self, inputs: List[Union[Any, ChannelType]], @@ -76,6 +78,7 @@ def do_exec_compiled_task( raise +@DeveloperAPI class CompiledTask: """Wraps the normal Ray DAGNode with some metadata.""" @@ -107,9 +110,13 @@ def __str__(self) -> str: """ +@DeveloperAPI class CompiledDAG: """Experimental class for accelerated execution. + This class should not be called directly. Instead, create + a ray.dag and call experimental_compile(). + See REP https://github.com/ray-project/enhancements/pull/48 for more information. """ @@ -393,6 +400,7 @@ def execute( return output_channels +@DeveloperAPI def build_compiled_dag_from_ray_dag( dag: "ray.dag.DAGNode", buffer_size_bytes: Optional[int] ) -> "CompiledDAG": From ff19557067e7528a88d8a10cb5bbe10ea852cae0 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Wed, 13 Dec 2023 15:05:31 -0800 Subject: [PATCH 66/66] x Signed-off-by: Stephanie Wang --- python/ray/dag/compiled_dag_node.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index bccf77389b9f1..4e86b5686f531 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -5,6 +5,7 @@ import ray import ray.experimental.channel as ray_channel +from ray.util.annotations import DeveloperAPI MAX_BUFFER_SIZE = int(100 * 1e6) # 100MB
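
The patches above (58-62) assemble the compiled-DAG path: experimental_compile(buffer_size_bytes) builds a CompiledDAG, and CompiledDAG.execute() writes the single argument into the DAG's input channel and hands back the output channel(s) for the caller to read. What follows is a minimal, hypothetical usage sketch of that API, not part of the patch series: the Adder actor is invented for illustration, and the begin_read()/end_read() calls are an assumption about the read side of ray.experimental.channel.Channel (only write() appears in the diffs above).

import ray
from ray.dag import InputNode


@ray.remote
class Adder:
    # Hypothetical actor used only for this sketch.
    def __init__(self, increment):
        self.increment = increment

    def inc(self, x):
        return x + self.increment


ray.init()
adder = Adder.remote(1)

# Bind a single actor method to the DAG input, as in the tests above.
with InputNode() as inp:
    dag = adder.inc.bind(inp)

# buffer_size_bytes caps the size of values passed through the shared-memory
# channels; when left as None it falls back to MAX_BUFFER_SIZE (100 MB).
compiled_dag = dag.experimental_compile(buffer_size_bytes=1000)

# execute() writes the argument into the DAG's input channel and returns the
# output channel (a single channel here, since no MultiOutputNode was used).
output_channel = compiled_dag.execute(41)

# Assumed Channel read API: begin_read() blocks until the value is ready,
# end_read() releases the buffer so the next execution can reuse it.
assert output_channel.begin_read() == 42
output_channel.end_read()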
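
Likewise, a hedged sketch of the multi-output case enabled by the OutputNode -> MultiOutputNode rename in patch 61, mirroring the tensor-parallel test at the top of this section: one InputNode fanned out to several single-task actors, with MultiOutputNode collecting one output channel per actor. The Worker class and the channel read calls are again assumptions for illustration, not code taken from the patches.

import ray
from ray.dag import InputNode, MultiOutputNode


@ray.remote
class Worker:
    # Hypothetical worker actor; compiled DAGs allow at most one task per
    # actor handle, so each worker contributes exactly one node.
    def __init__(self, rank):
        self.rank = rank

    def forward(self, x):
        return self.rank + x


ray.init()
NUM_WORKERS = 4
workers = [Worker.remote(rank) for rank in range(NUM_WORKERS)]

# A single InputNode feeds every worker; MultiOutputNode marks the DAG output.
with InputNode() as inp:
    dag = MultiOutputNode([w.forward.bind(inp) for w in workers])

compiled_dag = dag.experimental_compile()

# With a MultiOutputNode, execute() returns one output channel per worker.
channels = compiled_dag.execute(10)
results = [channel.begin_read() for channel in channels]
assert results == [10 + rank for rank in range(NUM_WORKERS)]
for channel in channels:
    channel.end_read()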