Commit 7001982

authored Dec 8, 2023
[RLlib] Add and enhance fault-tolerance tests for APPO. (#40743)
1 parent 563f7d8 commit 7001982

19 files changed: +520 -294 lines changed
 

‎rllib/BUILD

+42 -2

@@ -222,6 +222,48 @@ py_test(
     args = ["--dir=tuned_examples/appo"]
 )
 
+# Tests against crashing or hanging environments.
+# Single-agent: Crash only.
+py_test(
+    name = "learning_tests_cartpole_crashing_appo",
+    main = "tests/run_regression_tests.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "crashing_cartpole"],
+    size = "large",
+    srcs = ["tests/run_regression_tests.py"],
+    data = ["tuned_examples/appo/cartpole-crashing-recreate-workers-appo.py"],
+    args = ["--dir=tuned_examples/appo", "--num-cpus=6"]
+)
+# Single-agent: Crash and stall.
+py_test(
+    name = "learning_tests_cartpole_crashing_and_stalling_appo",
+    main = "tests/run_regression_tests.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "crashing_cartpole"],
+    size = "large",
+    srcs = ["tests/run_regression_tests.py"],
+    data = ["tuned_examples/appo/cartpole-crashing-and-stalling-recreate-workers-appo.py"],
+    args = ["--dir=tuned_examples/appo", "--num-cpus=6"]
+)
+# Multi-agent: Crash only.
+py_test(
+    name = "learning_tests_multi_agent_cartpole_crashing_appo",
+    main = "tests/run_regression_tests.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "crashing_cartpole"],
+    size = "large",
+    srcs = ["tests/run_regression_tests.py"],
+    data = ["tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py"],
+    args = ["--dir=tuned_examples/appo", "--num-cpus=6"]
+)
+# Multi-agent: Crash and stall.
+py_test(
+    name = "learning_tests_multi_agent_cartpole_crashing_and_stalling_appo",
+    main = "tests/run_regression_tests.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "crashing_cartpole"],
+    size = "large",
+    srcs = ["tests/run_regression_tests.py"],
+    data = ["tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py"],
+    args = ["--dir=tuned_examples/appo", "--num-cpus=6"]
+)
+
 # CQL
 py_test(
     name = "learning_tests_pendulum_cql",

@@ -1569,7 +1611,6 @@ py_test(
     args = ["TestCheckpointRestorePPO"]
 )
 
-
 py_test(
     name = "tests/test_checkpoint_restore_ppo_gpu",
     main = "tests/test_algorithm_checkpoint_restore.py",

@@ -1588,7 +1629,6 @@ py_test(
     args = ["TestCheckpointRestoreOffPolicy"]
 )
 
-
 py_test(
     name = "tests/test_checkpoint_restore_off_policy_gpu",
     main = "tests/test_algorithm_checkpoint_restore.py",

‎rllib/algorithms/algorithm.py

+3

@@ -1564,6 +1564,9 @@ def restore_workers(self, workers: WorkerSet) -> None:
         restored = workers.probe_unhealthy_workers()
 
         if restored:
+            # Count the restored workers.
+            self._counters["total_num_restored_workers"] += len(restored)
+
             from_worker = workers.local_worker() or self.workers.local_worker()
             # Get the state of the correct (reference) worker. E.g. The local worker
             # of the main WorkerSet.
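
The new counter lives on the Algorithm object itself (the same `_counters` dict the APPO fault-tolerance test below asserts on), so it can be read back directly after training. A minimal sketch, assuming a locally built APPO algorithm named `algo` (env choice and worker counts are illustrative):

from ray.rllib.algorithms.appo import APPOConfig

algo = (
    APPOConfig()
    .environment("CartPole-v1")
    .rollouts(num_rollout_workers=2)
    .fault_tolerance(recreate_failed_workers=True)
    .build()
)
for _ in range(3):
    algo.train()
# Counter added in this commit; stays 0 if no worker ever had to be restored.
print(algo._counters["total_num_restored_workers"])
algo.stop()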

‎rllib/algorithms/impala/impala.py

+30 -4

@@ -860,12 +860,11 @@ def default_resource_request(
             strategy=cf.placement_strategy,
         )
 
-    def concatenate_batches_and_pre_queue(self, batches: List[SampleBatch]):
+    def concatenate_batches_and_pre_queue(self, batches: List[SampleBatch]) -> None:
         """Concatenate batches that are being returned from rollout workers
 
         Args:
-            batches: batches of experiences from rollout workers
-
+            batches: List of batches of experiences from EnvRunners.
         """
 
         def aggregate_into_larger_batch():

@@ -878,6 +877,33 @@ def aggregate_into_larger_batch():
                 self.batch_being_built = []
 
         for batch in batches:
+            # TODO (sven): Strange bug in tf/tf2 after a RolloutWorker crash and proper
+            # restart. The bug is related to (old, non-V2) connectors being used and
+            # seems to happen inside the AgentCollector's `add_action_reward_next_obs`
+            # method, at the end of which the number of vf_preds (and all other
+            # extra action outs) in the batch is one smaller than the number of obs/
+            # actions/rewards, which leads to a malformed train batch. IMPALA/APPO then
+            # crash inside the loss function (during v-trace operations). The following
+            # if-block prevents this from happening and it can be removed once we are
+            # on the new API stack for good (and use the new connectors and also no
+            # longer AgentCollectors, RolloutWorkers, Policies, TrajectoryView API,
+            # etc..):
+            if (
+                self.config.batch_mode == "truncate_episodes"
+                and self.config.enable_connectors
+                and self.config.recreate_failed_workers
+                and self.config.framework_str in ["tf", "tf2"]
+            ):
+                if any(
+                    SampleBatch.VF_PREDS in pb
+                    and (
+                        pb[SampleBatch.VF_PREDS].shape[0]
+                        != pb[SampleBatch.REWARDS].shape[0]
+                    )
+                    for pb in batch.policy_batches.values()
+                ):
+                    continue
+
             self.batch_being_built.append(batch)
             aggregate_into_larger_batch()

@@ -929,7 +955,7 @@ def get_samples_from_workers(
             sample_batches = [(0, sample_batch)]
         else:
             # Not much we can do. Return empty list and wait.
-            return []
+            sample_batches = []
 
         return sample_batches
 
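
For context, the guard above skips a train batch whenever one of its per-policy batches carries fewer value-function predictions than rewards. A minimal sketch of that length check, with a plain dict standing in for a per-policy batch (values made up):

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch

# Column lengths as they can look after the tf/tf2 connector bug described above:
# one fewer vf_preds entry than rewards/obs/actions.
pb = {
    SampleBatch.REWARDS: np.zeros(50),
    SampleBatch.VF_PREDS: np.zeros(49),
}
malformed = (
    SampleBatch.VF_PREDS in pb
    and pb[SampleBatch.VF_PREDS].shape[0] != pb[SampleBatch.REWARDS].shape[0]
)
print(malformed)  # -> True; such a batch is skipped by the if-block above.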

‎rllib/algorithms/tests/test_callbacks.py

+18 -10

@@ -11,9 +11,10 @@
 from ray.rllib.evaluation.episode import Episode
 from ray.rllib.examples.env.random_env import RandomEnv
 from ray.rllib.utils.test_utils import framework_iterator
+from ray import tune
 
 
-class OnWorkerCreatedCallbacks(DefaultCallbacks):
+class OnWorkersRecreatedCallbacks(DefaultCallbacks):
     def on_workers_recreated(
         self,
         *,

@@ -109,11 +110,13 @@ def tearDownClass(cls):
         ray.shutdown()
 
     def test_on_workers_recreated_callback(self):
+        tune.register_env("env", lambda cfg: CartPoleCrashing(cfg))
+
         config = (
             APPOConfig()
-            .environment(CartPoleCrashing)
-            .callbacks(OnWorkerCreatedCallbacks)
-            .rollouts(num_rollout_workers=2)
+            .environment("env")
+            .callbacks(OnWorkersRecreatedCallbacks)
+            .rollouts(num_rollout_workers=3)
             .fault_tolerance(recreate_failed_workers=True)
         )
 

@@ -122,19 +125,24 @@ def test_on_workers_recreated_callback(self):
         original_worker_ids = algo.workers.healthy_worker_ids()
         for id_ in original_worker_ids:
             self.assertTrue(algo._counters[f"worker_{id_}_recreated"] == 0)
+        self.assertTrue(algo._counters["total_num_workers_recreated"] == 0)
 
         # After building the algorithm, we should have 2 healthy (remote) workers.
-        self.assertTrue(len(original_worker_ids) == 2)
+        self.assertTrue(len(original_worker_ids) == 3)
 
         # Train a bit (and have the envs/workers crash a couple of times).
-        for _ in range(3):
-            algo.train()
+        for _ in range(5):
+            print(algo.train())
 
-        # After training, each new worker should have been recreated at least once.
+        # After training, the `on_workers_recreated` callback should have captured
+        # the exact worker IDs recreated (the exact number of times) as the actor
+        # manager itself. This confirms that the callback is triggered correctly,
+        # always.
         new_worker_ids = algo.workers.healthy_worker_ids()
-        self.assertTrue(len(new_worker_ids) == 2)
+        self.assertTrue(len(new_worker_ids) == 3)
         for id_ in new_worker_ids:
-            self.assertTrue(algo._counters[f"worker_{id_}_recreated"] >= 1)
+            # num_restored = algo.workers.restored_actors_history[id_]
+            self.assertTrue(algo._counters[f"worker_{id_}_recreated"] > 1)
         algo.stop()
 
     def test_on_init_and_checkpoint_loaded(self):
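
The renamed OnWorkersRecreatedCallbacks class (its body is not part of this hunk) is what increments the `worker_{id}_recreated` and `total_num_workers_recreated` counters asserted above. A rough sketch of such a callback; the exact keyword arguments of the `on_workers_recreated` hook are assumed here, since they are not shown in this diff:

from ray.rllib.algorithms.callbacks import DefaultCallbacks

class CountRecreationsCallbacks(DefaultCallbacks):
    def on_workers_recreated(self, *, algorithm=None, worker_ids=None, **kwargs):
        # Bump one counter per recreated worker ID plus a global total, mirroring
        # the counters the test above checks via `algo._counters`.
        for worker_id in worker_ids or []:
            algorithm._counters[f"worker_{worker_id}_recreated"] += 1
            algorithm._counters["total_num_workers_recreated"] += 1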

‎rllib/evaluation/collectors/agent_collector.py

+1 -2

@@ -9,7 +9,7 @@
 
 from ray.rllib.policy.sample_batch import SampleBatch
 from ray.rllib.policy.view_requirement import ViewRequirement
-from ray.rllib.utils.framework import try_import_tf, try_import_torch
+from ray.rllib.utils.framework import try_import_torch
 from ray.rllib.utils.spaces.space_utils import (
     flatten_to_single_ndarray,
     get_dummy_batch_for_space,

@@ -24,7 +24,6 @@
 
 logger = logging.getLogger(__name__)
 
-_, tf, _ = try_import_tf()
 torch, _ = try_import_torch()
 
 
‎rllib/evaluation/collectors/simple_list_collector.py

-3

@@ -426,9 +426,6 @@ def postprocess_episode(
         episode_id = episode.episode_id
         policy_collector_group = episode.batch_builder
 
-        # TODO: (sven) Once we implement multi-agent communication channels,
-        # we have to resolve the restriction of only sending other agent
-        # batches from the same policy to the postprocess methods.
         # Build SampleBatches for the given episode.
         pre_batches = {}
         for (eps_id, agent_id), collector in self.agent_collectors.items():

‎rllib/examples/env/cartpole_crashing.py

+117 -22

@@ -11,45 +11,89 @@
 
 
 class CartPoleCrashing(CartPoleEnv):
-    """A CartPole env that crashes from time to time.
+    """A CartPole env that crashes (or stalls) from time to time.
 
     Useful for testing faulty sub-env (within a vectorized env) handling by
-    RolloutWorkers.
+    EnvRunners.
 
     After crashing, the env expects a `reset()` call next (calling `step()` will
     result in yet another error), which may or may not take a very long time to
     complete. This simulates the env having to reinitialize some sub-processes, e.g.
     an external connection.
+
+    The env can also be configured to stall (and do nothing during a call to `step()`)
+    from time to time for a configurable amount of time.
     """
 
     def __init__(self, config=None):
         super().__init__()
 
-        config = config or {}
+        self.config = config if config is not None else {}
 
         # Crash probability (in each `step()`).
         self.p_crash = config.get("p_crash", 0.005)
+        # Crash probability when `reset()` is called.
         self.p_crash_reset = config.get("p_crash_reset", 0.0)
+        # Crash exactly after every n steps. If a 2-tuple, will uniformly sample
+        # crash timesteps from in between the two given values.
         self.crash_after_n_steps = config.get("crash_after_n_steps")
-        # Only crash (with prob=p_crash) if on certain worker indices.
+        self._crash_after_n_steps = None
+        assert (
+            self.crash_after_n_steps is None
+            or isinstance(self.crash_after_n_steps, int)
+            or (
+                isinstance(self.crash_after_n_steps, tuple)
+                and len(self.crash_after_n_steps) == 2
+            )
+        )
+        # Only ever crash, if on certain worker indices.
         faulty_indices = config.get("crash_on_worker_indices", None)
         if faulty_indices and config.worker_index not in faulty_indices:
             self.p_crash = 0.0
             self.p_crash_reset = 0.0
             self.crash_after_n_steps = None
+
+        # Stall probability (in each `step()`).
+        self.p_stall = config.get("p_stall", 0.0)
+        # Stall probability when `reset()` is called.
+        self.p_stall_reset = config.get("p_stall_reset", 0.0)
+        # Stall exactly after every n steps.
+        self.stall_after_n_steps = config.get("stall_after_n_steps")
+        self._stall_after_n_steps = None
+        # Amount of time to stall. If a 2-tuple, will uniformly sample from in between
+        # the two given values.
+        self.stall_time_sec = config.get("stall_time_sec")
+        assert (
+            self.stall_time_sec is None
+            or isinstance(self.stall_time_sec, (int, float))
+            or (
+                isinstance(self.stall_time_sec, tuple) and len(self.stall_time_sec) == 2
+            )
+        )
+
+        # Only ever stall, if on certain worker indices.
+        faulty_indices = config.get("stall_on_worker_indices", None)
+        if faulty_indices and config.worker_index not in faulty_indices:
+            self.p_stall = 0.0
+            self.p_stall_reset = 0.0
+            self.stall_after_n_steps = None
+
         # Timestep counter for the ongoing episode.
         self.timesteps = 0
 
         # Time in seconds to initialize (in this c'tor).
+        sample = 0.0
         if "init_time_s" in config:
-            init_time_s = config.get("init_time_s", 0)
-        else:
-            init_time_s = np.random.randint(
-                config.get("init_time_s_min", 0),
-                config.get("init_time_s_max", 1),
+            sample = (
+                config["init_time_s"]
+                if not isinstance(config["init_time_s"], tuple)
+                else np.random.uniform(
+                    config["init_time_s"][0], config["init_time_s"][1]
+                )
             )
-        print(f"Initializing crashing env with init-delay of {init_time_s}sec ...")
-        time.sleep(init_time_s)
+
+        print(f"Initializing crashing env (with init-delay of {sample}sec) ...")
+        time.sleep(sample)
 
         # No env pre-checking?
         self._skip_env_checking = config.get("skip_env_checking", False)

@@ -61,30 +105,81 @@ def __init__(self, config=None):
     def reset(self, *, seed=None, options=None):
         # Reset timestep counter for the new episode.
         self.timesteps = 0
+        self._crash_after_n_steps = None
+
         # Should we crash?
-        if self._rng.rand() < self.p_crash_reset or (
-            self.crash_after_n_steps is not None and self.crash_after_n_steps == 0
-        ):
+        if self._should_crash(p=self.p_crash_reset):
             raise EnvError(
-                "Simulated env crash in `reset()`! Feel free to use any "
-                "other exception type here instead."
+                f"Simulated env crash on worker={self.config.worker_index} "
+                f"env-idx={self.config.vector_index} during `reset()`! "
+                "Feel free to use any other exception type here instead."
             )
+        # Should we stall for a while?
+        self._stall_if_necessary(p=self.p_stall_reset)
+
         return super().reset()
 
     @override(CartPoleEnv)
     def step(self, action):
         # Increase timestep counter for the ongoing episode.
         self.timesteps += 1
+
         # Should we crash?
-        if self._rng.rand() < self.p_crash or (
-            self.crash_after_n_steps and self.crash_after_n_steps == self.timesteps
-        ):
+        if self._should_crash(p=self.p_crash):
             raise EnvError(
-                "Simulated env crash in `step()`! Feel free to use any "
-                "other exception type here instead."
+                f"Simulated env crash on worker={self.config.worker_index} "
+                f"env-idx={self.config.vector_index} during `step()`! "
+                "Feel free to use any other exception type here instead."
            )
-        # No crash.
+        # Should we stall for a while?
+        self._stall_if_necessary(p=self.p_stall)
+
         return super().step(action)
 
+    def _should_crash(self, p):
+        rnd = self._rng.rand()
+        if rnd < p:
+            print(f"Should crash! ({rnd} < {p})")
+            return True
+        elif self.crash_after_n_steps is not None:
+            if self._crash_after_n_steps is None:
+                self._crash_after_n_steps = (
+                    self.crash_after_n_steps
+                    if not isinstance(self.crash_after_n_steps, tuple)
+                    else np.random.randint(
+                        self.crash_after_n_steps[0], self.crash_after_n_steps[1]
+                    )
+                )
+            if self._crash_after_n_steps == self.timesteps:
+                print(f"Should crash! (after {self.timesteps} steps)")
+                return True
+
+        return False
+
+    def _stall_if_necessary(self, p):
+        stall = False
+        if self._rng.rand() < p:
+            stall = True
+        elif self.stall_after_n_steps is not None:
+            if self._stall_after_n_steps is None:
+                self._stall_after_n_steps = (
+                    self.stall_after_n_steps
+                    if not isinstance(self.stall_after_n_steps, tuple)
+                    else np.random.randint(
+                        self.stall_after_n_steps[0], self.stall_after_n_steps[1]
+                    )
+                )
+            if self._stall_after_n_steps == self.timesteps:
+                stall = True
+
+        if stall:
+            sec = (
+                self.stall_time_sec
+                if not isinstance(self.stall_time_sec, tuple)
+                else np.random.uniform(self.stall_time_sec[0], self.stall_time_sec[1])
+            )
+            print(f" -> will stall for {sec}sec ...")
+            time.sleep(sec)
+
 
 MultiAgentCartPoleCrashing = make_multi_agent(lambda config: CartPoleCrashing(config))
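
All of the crash and stall knobs above are plain `env_config` entries, so a setup exercising the new stalling options looks like the following sketch (probability values are illustrative; the tuned examples below use their own settings):

from ray import tune
from ray.rllib.algorithms.appo import APPOConfig
from ray.rllib.examples.env.cartpole_crashing import CartPoleCrashing

tune.register_env("cartpole_crashing", lambda cfg: CartPoleCrashing(cfg))

config = (
    APPOConfig()
    .environment(
        "cartpole_crashing",
        env_config={
            "p_crash": 0.001,  # crash prob per `step()`
            "crash_after_n_steps": (200, 400),  # or: crash after a sampled step count
            "p_stall": 0.001,  # stall prob per `step()`
            "stall_time_sec": (2, 5),  # stall duration sampled from this range
        },
        # The env checker does not handle the simulated crashes.
        disable_env_checking=True,
    )
    .rollouts(num_rollout_workers=2)
    .fault_tolerance(recreate_failed_workers=True)
)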

‎rllib/execution/multi_gpu_learner_thread.py

+6 -1

@@ -140,7 +140,12 @@ def __init__(
 
     @override(LearnerThread)
     def step(self) -> None:
-        assert self.loader_thread.is_alive()
+        if not self.loader_thread.is_alive():
+            raise RuntimeError(
+                "The `_MultiGPULoaderThread` has died! Will therefore also terminate "
+                "the `MultiGPULearnerThread`."
+            )
+
         with self.load_wait_timer:
             buffer_idx, released = self.ready_tower_stacks_buffer.get()

‎rllib/policy/eager_tf_policy_v2.py

+1 -2

@@ -1033,9 +1033,8 @@ def _compute_actions_helper(
                     episodes=episodes,
                 )
             else:
+                # Try `action_distribution_fn`.
                 if is_overridden(self.action_distribution_fn):
-                    # Try new action_distribution_fn signature, supporting
-                    # state_batches and seq_lens.
                     (
                         dist_inputs,
                         self.dist_class,

rllib/tuned_examples/appo/cartpole-crashing-and-stalling-recreate-workers-appo.py

+74

@@ -0,0 +1,74 @@
"""
Tests, whether APPO can learn in a fault-tolerant fashion.

Workers will be configured to automatically get recreated upon failures (here: within
the environment).
The environment we use here is configured to crash with a certain probability on each
`step()` and/or `reset()` call. Additionally, the environment is configured to stall
with a configured probability on each `step()` call for a certain amount of time.
"""
from ray.rllib.algorithms.appo import APPOConfig
from ray.rllib.examples.env.cartpole_crashing import CartPoleCrashing
from ray import tune

tune.register_env("env", lambda cfg: CartPoleCrashing(cfg))


stop = {
    "evaluation/sampler_results/episode_reward_mean": 400.0,
    "num_env_steps_sampled": 250000,
}

config = (
    APPOConfig()
    .environment(
        "env",
        env_config={
            "p_crash": 0.0001,  # prob to crash during step()
            "p_crash_reset": 0.001,  # prob to crash during reset()
            "crash_on_worker_indices": [1, 2],
            "init_time_s": 2.0,
            "p_stall": 0.0005,  # prob to stall during step()
            "p_stall_reset": 0.001,  # prob to stall during reset()
            "stall_time_sec": (2, 5),  # stall between 2 and 10sec.
            "stall_on_worker_indices": [2, 3],
        },
        # Disable env checking. Env checker doesn't handle Exceptions from
        # user envs, and will crash rollout worker.
        disable_env_checking=True,
    )
    .rollouts(
        num_rollout_workers=1,
        num_envs_per_worker=1,
    )
    # Switch on resiliency (recreate any failed worker).
    .fault_tolerance(
        recreate_failed_workers=True,
    )
    .evaluation(
        evaluation_num_workers=1,
        evaluation_interval=1,
        evaluation_duration=25,
        evaluation_duration_unit="episodes",
        evaluation_parallel_to_training=True,
        enable_async_evaluation=True,
        evaluation_config=APPOConfig.overrides(
            explore=False,
            env_config={
                # Make eval workers solid.
                # This test is to prove that we can learn with crashing envs,
                # not evaluate with crashing envs.
                "p_crash": 0.0,
                "p_crash_reset": 0.0,
                "init_time_s": 0.0,
                "p_stall": 0.0,
                "p_stall_reset": 0.0,
            },
        ),
    )
)


# algo = config.framework("tf2").build()
# for _ in range(1000):
#     print(algo.train())
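
These new tuned-example files expose a `config` object and a `stop` dict instead of a YAML spec. As a rough sketch (the Tuner call below is assumed, not part of the file), they can be run through Ray Tune like this:

from ray import air, tune

# `config` and `stop` as defined in the tuned example above.
tuner = tune.Tuner(
    "APPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop=stop),
)
results = tuner.fit()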

rllib/tuned_examples/appo/cartpole-crashing-recreate-workers-appo.py

+62

@@ -0,0 +1,62 @@
"""
Tests, whether APPO can learn in a fault-tolerant fashion.

Workers will be configured to automatically get recreated upon failures (here: within
the environment).
The environment we use here is configured to crash with a certain probability on each
`step()` and/or `reset()` call.
"""
from ray.rllib.algorithms.appo import APPOConfig
from ray.rllib.examples.env.cartpole_crashing import CartPoleCrashing
from ray import tune

tune.register_env("env", lambda cfg: CartPoleCrashing(cfg))


stop = {
    "evaluation/sampler_results/episode_reward_mean": 400.0,
    "num_env_steps_sampled": 250000,
}

config = (
    APPOConfig()
    .environment(
        "env",
        env_config={
            # Crash roughly every 500 ts.
            "p_crash": 0.0005,  # prob to crash during step()
            "p_crash_reset": 0.005,  # prob to crash during reset()
            "crash_on_worker_indices": [1, 2],
        },
        # Disable env checking. Env checker doesn't handle Exceptions from
        # user envs, and will crash rollout worker.
        disable_env_checking=True,
    )
    .rollouts(
        num_rollout_workers=3,
        num_envs_per_worker=1,
    )
    # Switch on resiliency (recreate any failed worker).
    .fault_tolerance(
        recreate_failed_workers=True,
    )
    .evaluation(
        evaluation_num_workers=1,
        evaluation_interval=1,
        evaluation_duration=25,
        evaluation_duration_unit="episodes",
        evaluation_parallel_to_training=True,
        enable_async_evaluation=True,
        evaluation_config=APPOConfig.overrides(
            explore=False,
            env_config={
                # Make eval workers solid.
                # This test is to prove that we can learn with crashing envs,
                # not evaluate with crashing envs.
                "p_crash": 0.0,
                "p_crash_reset": 0.0,
                "init_time_s": 0.0,
            },
        ),
    )
)

rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py

+70

@@ -0,0 +1,70 @@
"""
Tests, whether APPO can learn in a fault-tolerant fashion in a
multi-agent setting.

Workers will be configured to automatically get recreated upon failures (here: within
the environment).
The environment we use here is configured to crash with a certain probability on each
`step()` and/or `reset()` call.
"""
from ray.rllib.algorithms.appo import APPOConfig
from ray.rllib.examples.env.cartpole_crashing import MultiAgentCartPoleCrashing
from ray import tune

tune.register_env("ma_env", lambda cfg: MultiAgentCartPoleCrashing(cfg))

stop = {
    "evaluation/sampler_results/episode_reward_mean": 800.0,
    "num_env_steps_sampled": 250000,
}

config = (
    APPOConfig()
    .environment(
        "ma_env",
        env_config={
            "num_agents": 2,
            # Crash roughly every 300 ts. This should be ok to measure 180.0
            # reward (episodes are 200 ts long).
            "p_crash": 0.00005,  # prob to crash during step()
            "p_crash_reset": 0.0005,  # prob to crash during reset()
            "init_time_s": 2.0,
            "p_stall": 0.001,  # prob to stall during step()
            "p_stall_reset": 0.001,  # prob to stall during reset()
            "stall_time_sec": (2, 5),  # stall between 2 and 10sec.
            "stall_on_worker_indices": [2, 3],
        },
        # Disable env checking. Env checker doesn't handle Exceptions from
        # user envs, and will crash rollout worker.
        disable_env_checking=True,
    )
    .rollouts(
        num_rollout_workers=3,
        num_envs_per_worker=1,
    )
    # Switch on resiliency (recreate any failed worker).
    .fault_tolerance(
        recreate_failed_workers=True,
    )
    .evaluation(
        evaluation_num_workers=1,
        evaluation_interval=1,
        evaluation_duration=25,
        evaluation_duration_unit="episodes",
        evaluation_parallel_to_training=True,
        enable_async_evaluation=True,
        evaluation_config=APPOConfig.overrides(
            explore=False,
            env_config={
                # Make eval workers solid.
                # This test is to prove that we can learn with crashing envs,
                # not evaluate with crashing envs.
                "p_crash": 0.0,
                "p_crash_reset": 0.0,
                "init_time_s": 0.0,
                "p_stall": 0.0,
                "p_stall_reset": 0.0,
            },
        ),
    )
)

rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py

+63

@@ -0,0 +1,63 @@
"""
Tests, whether APPO can learn in a fault-tolerant fashion in a
multi-agent setting.

Workers will be configured to automatically get recreated upon failures (here: within
the environment).
The environment we use here is configured to crash with a certain probability on each
`step()` and/or `reset()` call.
"""
from ray.rllib.algorithms.appo import APPOConfig
from ray.rllib.examples.env.cartpole_crashing import MultiAgentCartPoleCrashing
from ray import tune

tune.register_env("ma_env", lambda cfg: MultiAgentCartPoleCrashing(cfg))

stop = {
    "evaluation/sampler_results/episode_reward_mean": 800.0,
    "num_env_steps_sampled": 250000,
}

config = (
    APPOConfig()
    .environment(
        "ma_env",
        env_config={
            "num_agents": 2,
            # Crash roughly every 300 ts. This should be ok to measure 180.0
            # reward (episodes are 200 ts long).
            "p_crash": 0.0005,  # prob to crash during step()
            "p_crash_reset": 0.005,  # prob to crash during reset()
        },
        # Disable env checking. Env checker doesn't handle Exceptions from
        # user envs, and will crash rollout worker.
        disable_env_checking=True,
    )
    .rollouts(
        num_rollout_workers=4,
        num_envs_per_worker=1,
    )
    # Switch on resiliency (recreate any failed worker).
    .fault_tolerance(
        recreate_failed_workers=True,
    )
    .evaluation(
        evaluation_num_workers=1,
        evaluation_interval=1,
        evaluation_duration=25,
        evaluation_duration_unit="episodes",
        evaluation_parallel_to_training=True,
        enable_async_evaluation=True,
        evaluation_config=APPOConfig.overrides(
            explore=False,
            env_config={
                # Make eval workers solid.
                # This test is to prove that we can learn with crashing envs,
                # not evaluate with crashing envs.
                "p_crash": 0.0,
                "p_crash_reset": 0.0,
                "init_time_s": 0.0,
            },
        ),
    )
)

‎rllib/tuned_examples/appo/multi-agent-cartpole-crashing-restart-env-appo.yaml

-53
This file was deleted.

‎rllib/tuned_examples/pg/cartpole-crashing-pg.yaml

-45
This file was deleted.

‎rllib/tuned_examples/pg/cartpole-crashing-with-remote-envs-pg.yaml

-47
This file was deleted.

‎rllib/tuned_examples/pg/multi-agent-cartpole-crashing-restart-sub-envs-pg.yaml

-47
This file was deleted.

‎rllib/tuned_examples/pg/multi-agent-cartpole-crashing-with-remote-envs-pg.yaml

-49
This file was deleted.

‎rllib/utils/actor_manager.py

+33 -7

@@ -179,7 +179,7 @@ def apply(
         except Exception as e:
             # Actor should be recreated by Ray.
             if self.config.recreate_failed_workers:
-                logger.exception("Worker exception, recreating: {}".format(e))
+                logger.exception(f"Worker exception caught during `apply()`: {e}")
                 # Small delay to allow logs messages to propagate.
                 time.sleep(self.config.delay_between_worker_restarts_s)
                 # Kill this worker so Ray Core can restart it.

@@ -260,6 +260,7 @@ def __init__(
         # Actors are stored in a map and indexed by a unique id.
         self.__actors: Mapping[int, ActorHandle] = {}
         self.__remote_actor_states: Mapping[int, self._ActorState] = {}
+        self.__restored_actors = set()
         self.add_actors(actors or [])
 
         # Maps outstanding async requests to the ids of the actors that

@@ -328,6 +329,7 @@ def remove_actor(self, actor_id: int) -> ActorHandle:
         # Remove the actor from the pool.
         del self.__actors[actor_id]
         del self.__remote_actor_states[actor_id]
+        self.__restored_actors.discard(actor_id)
         self._remove_async_state(actor_id)
 
         return actor

@@ -376,6 +378,15 @@ def set_actor_state(self, actor_id: int, healthy: bool) -> None:
         """
         if actor_id not in self.__remote_actor_states:
             raise ValueError(f"Unknown actor id: {actor_id}")
+
+        was_healthy = self.__remote_actor_states[actor_id].is_healthy
+        # Set from unhealthy to healthy -> Add to restored set.
+        if not was_healthy and healthy:
+            self.__restored_actors.add(actor_id)
+        # Set from healthy to unhealthy -> Remove from restored set.
+        elif was_healthy and not healthy:
+            self.__restored_actors.discard(actor_id)
+
         self.__remote_actor_states[actor_id].is_healthy = healthy
 
         if not healthy:

@@ -389,6 +400,7 @@ def clear(self):
             ray.kill(actor)
         self.__actors.clear()
         self.__remote_actor_states.clear()
+        self.__restored_actors.clear()
         self.__in_flight_req_to_actor_id.clear()
 
     def __call_actors(

@@ -487,8 +499,9 @@ def __fetch_result(
                 result = ray.get(r)
                 remote_results.add_result(actor_id, ResultOrError(result=result), tag)
 
+                # Actor came back from an unhealthy state. Mark this actor as healthy
+                # and add it to our restored set.
                 if mark_healthy and not self.is_actor_healthy(actor_id):
-                    # Yay, mark this actor as healthy.
                     logger.info(f"brining actor {actor_id} back into service.")
                     self.set_actor_state(actor_id, healthy=True)
                     self._num_actor_restarts += 1

@@ -498,7 +511,7 @@ def __fetch_result(
 
                 # Mark the actor as unhealthy.
                 # TODO(jungong): Using RayError here to preserve historical behavior.
-                #  It may very likely be better to use RayActorError here.
+                # It may very likely be better to use RayActorError here.
                 if isinstance(e, RayError):
                     # Take this actor out of service and wait for Ray Core to
                     # restore it.

@@ -790,17 +803,27 @@ def probe_unhealthy_actors(
             mark_healthy: Whether to mark actors healthy if they respond to the ping.
 
         Returns:
-            A list of actor ids that are restored.
+            A list of actor IDs that were restored by the `ping` AND those actors that
+            were previously restored via other remote requests. The cached set of
+            such previously restored actors will be erased in this call.
         """
+        # Collect recently restored actors (from `self.__fetch_result` calls other than
+        # the one triggered here via the `ping`).
+        restored_actors = list(self.__restored_actors)
+        self.__restored_actors.clear()
+
+        # Probe all unhealthy actors via a simple `ping()`.
         unhealthy_actor_ids = [
             actor_id
             for actor_id in self.actor_ids()
             if not self.is_actor_healthy(actor_id)
         ]
+        # No unhealthy actors currently -> Return recently restored ones.
         if not unhealthy_actor_ids:
-            # Great, nothing to do.
-            return []
+            return restored_actors
 
+        # Some unhealthy actors -> `ping()` all of them to trigger a new fetch and
+        # capture all restored ones.
         remote_results = self.foreach_actor(
             func=lambda actor: actor.ping(),
             remote_actor_ids=unhealthy_actor_ids,

@@ -809,7 +832,10 @@ def probe_unhealthy_actors(
             mark_healthy=mark_healthy,
         )
 
-        return [result.actor_id for result in remote_results if result.ok]
+        # Return previously restored actors AND actors restored via the `ping()` call.
+        return restored_actors + [
+            result.actor_id for result in remote_results if result.ok
+        ]
 
     def actors(self):
         # TODO(jungong) : remove this API once WorkerSet.remote_workers()
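
The effect of the new restored-actor cache is that a restoration observed by any remote call is no longer lost before the next health probe. A rough usage sketch (manager construction and the `workers` actor handles are illustrative, not part of this diff):

from ray.rllib.utils.actor_manager import FaultTolerantActorManager

# `workers` is assumed to be a list of remote actor handles (e.g. RolloutWorkers).
manager = FaultTolerantActorManager(actors=workers)

# Any remote call issued with mark_healthy=True may bring a previously failed
# actor back into service; its ID is remembered in the restored set.
manager.foreach_actor(lambda w: w.ping(), mark_healthy=True)

# Later, even if no actor is unhealthy anymore at this point, the probe still
# reports the IDs restored above (and clears the cached set).
restored_ids = manager.probe_unhealthy_actors(mark_healthy=True)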

0 commit comments
