Fix and refine graph save runtime_state_dict #10016

Merged · 6 commits · Mar 22, 2023
Changes from 3 commits
75 changes: 48 additions & 27 deletions python/oneflow/nn/graph/cache.py
@@ -26,48 +26,52 @@
class LRUCache(object):
_cnt: int = 0

def __init__(self, cache_size):
def __init__(self, cache_size, keep_the_1st=True):
assert cache_size >= 2
self.cache_size = cache_size
self.queue = deque()
self.hash_map = dict()

def front(self):
if self.is_empty():
return None

key = self.queue[0]
return self.hash_map[key]
self.keep_the_1st = keep_the_1st
self.queue = deque()

def is_empty(self):
return len(self.queue) == 0
return len(self.hash_map) == 0

def is_queue_full(self):
return len(self.queue) >= self.cache_size
def is_full(self):
return len(self.hash_map) >= self.cache_size

def pop(self):
if len(self.queue) == 0:
return None
pop_key = self.queue.pop()
value = self.hash_map.pop(pop_key)
del value
return pop_key

def set(self, key, value):
new_key = None
old_key = None
if key in self.hash_map:
return None
return new_key, old_key

pop_key = None
while self.is_queue_full():
pop_key = self.pop()
if self.is_full():
old_key = self.pop()
Contributor: Won't this pop evict the key of the oldest (base) graph?

Contributor (Author): Yes, it will.

assert old_key is not None, f"Cache size is {self.cache_size}, at least 2."
assert not self.is_full()

if not (self.keep_the_1st and self.is_empty()):
self.queue.appendleft(key)
Contributor (Author): This avoids the first (base) graph being evicted from the cache.

Contributor: How does this avoid the first base graph being evicted?

Contributor (Author): self.queue stores the cache keys that are eligible for eviction, following an LRU policy: the oldest key sits at the eviction end and the newest at the other, and the ordering is adjusted according to usage so the queue keeps the key order LRU needs.

The base graph is protected because it is always the first graph added to the cache; since it is never added to self.queue, it can never be evicted.
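
To make that concrete, here is a minimal illustrative sketch (not part of this diff); base_graph, graph_a, and graph_b stand in for graph objects that allow attribute assignment:

```python
# Illustrative only: how keep_the_1st shields the base graph from eviction.
cache = LRUCache(cache_size=2, keep_the_1st=True)

cache.set("base", base_graph)    # first entry: never added to self.queue, so never evicted
cache.set("shape_a", graph_a)    # added to self.queue, eligible for eviction
new_key, old_key = cache.set("shape_b", graph_b)
# The cache was full, so the oldest evictable key is popped:
# old_key == "shape_a", while "base" is still present in cache.hash_map.
```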


self.queue.appendleft(key)
value._oneflow_graph_cache_order = LRUCache._cnt
LRUCache._cnt += 1
self.hash_map[key] = value
return pop_key if pop_key is not None else key
new_key = key
return new_key, old_key

def get(self, key):
if key in self.hash_map:
self.queue.remove(key)
self.queue.appendleft(key)
if key in self.queue:
self.queue.remove(key)
self.queue.appendleft(key)
return self.hash_map[key]

return None
@@ -111,21 +115,27 @@ def __call__(self, *args, **kwargs):
return graph(*args, **kwargs)

def runtime_state_dict(
self, destination=None
self, destination=None, with_eager=False,
) -> Dict[str, Dict[str, Union[Dict[str, Tensor], str]]]:
if destination is None:
destination = OrderedDict()
destination._metadata = OrderedDict()

for (key, graph) in self._cache.items():
with AvoidRecursiveCacheCall(graph):
state_dict = graph.runtime_state_dict()
state_dict = graph.runtime_state_dict(with_eager=with_eager)
state_dict["cache_order"] = graph._oneflow_graph_cache_order
state_dict["cache_key"] = key
destination[state_dict["graph_name"]] = state_dict
return destination

def _init_and_get_a_graph_in_cache(self, cache_key):
self._base_graph._print(
0,
0,
self._base_graph._shallow_repr()
+ f" is creating a graph cache with key {cache_key}.",
)
cur_is_base = False
if self._cache.is_empty():
# Has no graph yet
@@ -135,7 +145,7 @@ def _init_and_get_a_graph_in_cache(self, cache_key):
# Create new graph from base
graph = self._base_graph.__class__(
*self._base_graph._cached_init_args,
**self._base_graph._cached_init_kwargs
**self._base_graph._cached_init_kwargs,
)
graph._run_with_cache = False
graph._dynamic_input_graph_cache = None
@@ -147,8 +157,16 @@ def _init_and_get_a_graph_in_cache(self, cache_key):
graph.enable_shared()
else:
graph.share_from(self._base_graph)
ret = self._cache.set(cache_key, graph)
assert ret is not None
new_key, old_key = self._cache.set(cache_key, graph)
if old_key is not None:
self._base_graph._print(
0,
0,
self._base_graph._shallow_repr()
+ f" cache is full(cache size {self._cache_size}), has deleted an old graph cache with key {old_key}.",
)
assert new_key is not None

return graph

def load_runtime_state_dict(
@@ -159,9 +177,12 @@ def load_runtime_state_dict(
cache_order = sub_state_dict["cache_order"]
graph_dict[cache_order] = sub_state_dict

self._cache = LRUCache(self._cache_size)
if self._cache is None:
self._cache = LRUCache(self._cache_size)
for _, sub_state_dict in sorted(graph_dict.items()):
cache_key = sub_state_dict["cache_key"]
graph = self._cache.get(cache_key)
assert graph is None
graph = self._init_and_get_a_graph_in_cache(cache_key)
with AvoidRecursiveCacheCall(graph):
graph.load_runtime_state_dict(sub_state_dict)
@@ -183,12 +204,12 @@ def get_graph(self, *args, **kwargs):

# Create graph
if graph is None:
graph = self._init_and_get_a_graph_in_cache(cache_key)
self._base_graph._print(
0,
0,
self._base_graph._shallow_repr()
+ " got a new input shape, is compiling a new graph.",
)
graph = self._init_and_get_a_graph_in_cache(cache_key)

return graph
44 changes: 38 additions & 6 deletions python/oneflow/nn/graph/graph.py
@@ -730,6 +730,7 @@ def _filter_states(self):
self._variables_conf[state_tensor] = VariableConfig(op_name)

self._state_tensor_tuple = convert_to_tensor_tuple(state_tensors)
self._eager_state_op_names = deepcopy(state_op_names)
return state_op_names

def _generate_config_proto(self):
@@ -1011,13 +1012,15 @@ def enable_save_runtime_state_dict(self, mode: bool = True):
self._enable_save_runtime_state_dict = False

def runtime_state_dict(
self, destination=None
self, destination=None, with_eager=False
Contributor (Author): By default, the tensors held by the eager module are not saved.
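
A rough usage sketch of the new flag (illustrative, not from this PR): it assumes saving is enabled before the first run and that the eager module is rebuilt on the load side, so its parameters do not need to travel inside the runtime state dict; MyGraph and the file name are made up.

```python
import oneflow
import oneflow.nn as nn

class MyGraph(nn.Graph):
    def __init__(self, module):
        super().__init__()
        self.m = module

    def build(self, x):
        return self.m(x)

module = nn.Linear(4, 4)

# Save side: capture the compiled runtime, leaving eager-module tensors out.
g = MyGraph(module)
g.enable_save_runtime_state_dict()
g(oneflow.randn(2, 4))  # trigger compilation
oneflow.save(g.runtime_state_dict(with_eager=False), "runtime_state")

# Load side: parameters come from the eager module, the compiled runtime from the dict.
g2 = MyGraph(module)
g2.load_runtime_state_dict(oneflow.load("runtime_state"))
```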

) -> Union[
Dict[str, Union[Dict[str, Tensor], str]],
Dict[str, Dict[str, Union[Dict[str, Tensor], str]]],
]:
if self._run_with_cache == True:
return self._dynamic_input_graph_cache.runtime_state_dict()
return self._dynamic_input_graph_cache.runtime_state_dict(
with_eager=with_eager
)

assert (
self._enable_save_runtime_state_dict
@@ -1067,10 +1070,28 @@ def gen_index_in_tuple(eager_out):
)
destination["outputs"] = outputs_sub_destination

destination["oneflow_with_eager_tensor"] = with_eager
if not self._build_with_shared_graph:
_state_tensor_tuple4save = []
if with_eager:
_state_tensor_tuple4save = self._state_tensor_tuple
else:
assert len(self._state_tensor_tuple) == len(self._state_op_names)
for state_idx in range(len(self._state_tensor_tuple)):
if self._state_op_names[state_idx] in self._eager_state_op_names:
Contributor: In what case does the eager module hold a tensor that the graph state does not hold? Is that a constant-folded tensor?

Is an eager free tensor a graph state tensor?

Contributor (Author): Yes, constant-folded tensors are one such case.

An eager free tensor is also a graph state tensor, but it does not live on the eager module.

# This state tensor is from eager module. Just save a dummy tensor here.
_state_tensor_tuple4save.append(
oneflow.Tensor().to(
self._state_tensor_tuple[state_idx].device
Contributor (Author): If the tensors on the eager module do not need to be saved here (they are usually saved and loaded through the eager module itself), we can store just an empty tensor instead, which reduces the size of runtime_state_dict. A short illustration follows after this block of code.

)
)
else:
_state_tensor_tuple4save.append(
self._state_tensor_tuple[state_idx]
)
states_sub_destination = OrderedDict()
_fill_sub_destination(
states_sub_destination, self._state_op_names, self._state_tensor_tuple
states_sub_destination, self._state_op_names, _state_tensor_tuple4save
)
destination["states"] = states_sub_destination

@@ -1140,6 +1161,13 @@ def get_tensor_in_tuple(map_item):
get_tensor_in_tuple, *_eager_outputs_index
)
self._eager_outputs = _eager_outputs

# Load state tensor of modules
if "oneflow_with_eager_tensor" in state_dict:
with_eager = state_dict["oneflow_with_eager_tensor"]
else:
with_eager = True

if self._build_with_shared_graph:
self._state_op_names = self._shared_graph._state_op_names
self._state_tensor_tuple = self._shared_graph._state_tensor_tuple
@@ -1160,10 +1188,14 @@
for s_idx, s_name in enumerate(self._state_op_names):
if s_name in states_from_eager:
state_tensor_from_eager = states_from_eager[s_name]
# Note: compare value has extra cost.
assert oneflow.allclose(
state_tensor_from_eager, self._state_tensor_tuple[s_idx]
assert (
state_tensor_from_eager.device
== self._state_tensor_tuple[s_idx].device
)
if with_eager:
assert oneflow.allclose(
state_tensor_from_eager, self._state_tensor_tuple[s_idx]
)
self._state_tensor_tuple[s_idx] = state_tensor_from_eager

self.__build_outputs_buffer()