Commit d8c3845

Ilia Cherniavskii authored and facebook-github-bot committed on Jun 23, 2020
Destroy CUDA events after profiling (pytorch#39962)
Summary:
Pull Request resolved: pytorch#39962

Adds a simple reference-counted wrapper around the profiler's CUDA event and destroys the underlying CUDA event once the last copy of the wrapper is destroyed.

Test Plan: CI CUDA profiler tests

Differential Revision: D22027092

Pulled By: ilia-cher

fbshipit-source-id: e0810388aa60b2291eb010896e13af1fad92e472
1 parent a54bb4e commit d8c3845

File tree

6 files changed: +95 -37 lines changed
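The core of the change, applied in torch/csrc/autograd/profiler_cuda.cpp below, is to manage the profiler's CUDA events through a reference-counted handle. Here is a minimal standalone sketch of that technique, assuming illustrative names (CudaEventHandle, make_ref_counted_event) and omitting error checking; it is not the PyTorch source:

// Minimal sketch of the ref-counting approach (illustrative, not PyTorch source;
// error checking omitted). A raw cudaEvent_t is wrapped in a std::shared_ptr
// whose deleter calls cudaEventDestroy, so the event is released exactly once,
// when the last copy of the handle is destroyed.
#include <cuda_runtime.h>
#include <memory>

using CudaEventHandle = std::shared_ptr<CUevent_st>;   // cudaEvent_t is CUevent_st*

CudaEventHandle make_ref_counted_event() {
  cudaEvent_t raw = nullptr;
  cudaEventCreate(&raw);                                // allocate the CUDA event
  return CudaEventHandle(raw, [](CUevent_st* ptr) {
    cudaEventDestroy(ptr);                              // runs when the last copy goes away
  });
}

int main() {
  CudaEventHandle a = make_ref_counted_event();
  CudaEventHandle b = a;                                // copying only bumps the ref count
  cudaEventRecord(a.get(), /*stream=*/0);               // record on the default stream
  a.reset();                                            // event still alive, b holds it
  b.reset();                                            // last reference dropped -> cudaEventDestroy
  return 0;
}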
 
@@ -1,10 +1,9 @@
+from functools import partial
 import itertools
 import statistics
 import timeit
 import torch

-profiling_enabled = None
-profiling_tensor_size = None
 TENSOR_SIZES = [1, 32, 128, 256, 512]
 INTERNAL_ITER = 256
 PARALLEL_TASKS_NUM = 4
@@ -16,13 +15,12 @@ def loop_workload(x):
     return x

 traced_loop_workload = None
-def run_profiler_benchmark_loop():
-    x = torch.rand(profiling_tensor_size, profiling_tensor_size)
+def run_profiler_benchmark_loop(input_x, use_cuda, profiling_enabled):
     if profiling_enabled:
-        with torch.autograd.profiler.profile() as prof:
-            traced_loop_workload(x)
+        with torch.autograd.profiler.profile(use_cuda=use_cuda) as prof:
+            traced_loop_workload(input_x)
     else:
-        traced_loop_workload(x)
+        traced_loop_workload(input_x)

 def parallel_task(x):
     for i in range(int(INTERNAL_ITER / PARALLEL_TASKS_NUM)):
@@ -38,40 +36,49 @@ def parallel_workload(x):
     return x

 traced_parallel_workload = None
-def run_profiler_benchmark_parallel():
-    x = torch.rand(profiling_tensor_size, profiling_tensor_size)
+def run_profiler_benchmark_parallel(input_x, use_cuda, profiling_enabled):
     if profiling_enabled:
-        with torch.autograd.profiler.profile() as prof:
-            traced_parallel_workload(x)
+        with torch.autograd.profiler.profile(use_cuda=use_cuda) as prof:
+            traced_parallel_workload(input_x)
     else:
-        traced_parallel_workload(x)
+        traced_parallel_workload(input_x)

 if __name__ == '__main__':
     for workload_name in ["loop", "parallel"]:
         print("Payload: {}; {} iterations, N = {}\n".format(
             workload_name, INTERNAL_ITER, N))
-        for params in itertools.product(TENSOR_SIZES, [False, True]):
-            profiling_tensor_size = params[0]
-            profiling_enabled = params[1]
+        for params in itertools.product([False, True], TENSOR_SIZES, [False, True]):
+            use_cuda = params[0]
+            profiling_tensor_size = params[1]
+            profiling_enabled = params[2]

-            print("Profiling {}, tensor size {}x{}".format(
-                "enabled " if profiling_enabled else "disabled",
-                profiling_tensor_size, profiling_tensor_size))
+            if (use_cuda and not torch.cuda.is_available()):
+                continue

-            x = torch.rand(profiling_tensor_size, profiling_tensor_size)
+            print("Profiling {}, tensor size {}x{}, use cuda: {}".format(
+                "enabled" if profiling_enabled else "disabled",
+                profiling_tensor_size, profiling_tensor_size, use_cuda))
+
+            input_x = torch.rand(profiling_tensor_size, profiling_tensor_size)
+            if use_cuda:
+                input_x = input_x.cuda()
             workload = None
             if workload_name == "loop":
-                workload = run_profiler_benchmark_loop
-                traced_loop_workload = torch.jit.trace(loop_workload, x)
+                workload = partial(
+                    run_profiler_benchmark_loop, input_x, use_cuda, profiling_enabled)
+                traced_loop_workload = torch.jit.trace(loop_workload, input_x)
             elif workload_name == "parallel":
-                workload = run_profiler_benchmark_parallel
+                workload = partial(
+                    run_profiler_benchmark_parallel, input_x, use_cuda, profiling_enabled)
                 traced_parallel_workload = torch.jit.trace(
-                    parallel_workload, x)
+                    parallel_workload, input_x)

             runtimes = timeit.repeat(workload, repeat=N, number=1)
             avg_time = statistics.mean(runtimes) * 1000.0
             stddev_time = statistics.stdev(runtimes) * 1000.0
             print("\tavg. time: {:.3f} ms, stddev: {:.3f} ms".format(
                 avg_time, stddev_time))
-            print("\ttime per iteration: {:.3f} ms\n".format(
-                avg_time / INTERNAL_ITER))
+            if workload_name == "loop":
+                print("\ttime per iteration: {:.3f} ms".format(
+                    avg_time / INTERNAL_ITER))
+            print()

‎test/run_test.py

+1

@@ -70,6 +70,7 @@
     'test_jit_fuser_te',
     'test_tensorexpr',
     'test_openmp',
+    'test_profiler',
     'distributed/nn/jit/test_instantiator',
     'distributed/nn/api/test_remote_module_spawn',
     'distributed/rpc/faulty_agent/test_dist_autograd_spawn',

‎test/test_profiler.py

+45

@@ -0,0 +1,45 @@
+import collections
+import gc
+import unittest
+
+import torch
+from torch.testing._internal.common_utils import (
+    TestCase, run_tests, TEST_WITH_ASAN)
+from torch.autograd.profiler import profile
+
+try:
+    import psutil
+    HAS_PSUTIL = True
+except ImportError:
+    HAS_PSUTIL = False
+
+
+@unittest.skipIf(not HAS_PSUTIL, "Requires psutil to run")
+@unittest.skipIf(TEST_WITH_ASAN, "Cannot test with ASAN")
+@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
+class TestProfiler_cuda(TestCase):
+    def test_mem_leak(self):
+        """Checks that there's no memory leak when using profiler with CUDA
+        """
+        t = torch.rand(1, 1).cuda()
+        p = psutil.Process()
+        last_rss = collections.deque(maxlen=5)
+        for outer_idx in range(10):
+            with profile(use_cuda=True):
+                for _ in range(1024):
+                    t = torch.mm(t, t)
+
+            gc.collect()
+            torch.cuda.empty_cache()
+            last_rss.append(p.memory_info().rss)
+
+        max_diff = -1
+        for idx in range(1, len(last_rss)):
+            max_diff = max(max_diff, last_rss[idx] - last_rss[idx - 1])
+
+        # with CUDA events leaking the increase in memory was ~7 MB,
+        # using much smaller threshold but not zero to reduce flakiness
+        self.assertTrue(max_diff < 100 * 1024)
+
+if __name__ == '__main__':
+    run_tests()

‎torch/csrc/autograd/profiler.cpp

+3 -3

@@ -264,7 +264,7 @@ struct ProfilerThreadLocalState
           thread_id,
           config_.state == ProfilerState::CUDA);
       evt.updateMemoryStats(alloc_size, device);
-      getEventList(thread_id).record(evt);
+      getEventList(thread_id).record(std::move(evt));
     }
   }

@@ -554,7 +554,7 @@ at::IValue Event::toIValue() const {
   return at::IValue(eventIValueList);
 }

-double Event::cuda_elapsed_us(const Event & e) const {
+double Event::cuda_elapsed_us(const Event& e) const {
   TORCH_CHECK(e.has_cuda() && has_cuda(), "Events were not recorded for CUDA");
   TORCH_CHECK(
       e.device() == device(),
@@ -565,7 +565,7 @@ double Event::cuda_elapsed_us(const Event & e) const {
     TORCH_INTERNAL_ASSERT(cuda_us_ >= 0 && e.cuda_us_ >= 0);
     return static_cast<double>(e.cuda_us_ - cuda_us_);
   }
-  return cuda_stubs->elapsed(cuda_event, e.cuda_event);
+  return cuda_stubs->elapsed(&cuda_event, &e.cuda_event);
 }

 CUDAStubs::~CUDAStubs() = default;

‎torch/csrc/autograd/profiler.h

+4 -3

@@ -20,7 +20,8 @@

 #include <ATen/record_function.h>

-typedef struct CUevent_st* CUDAEventStub;
+struct CUevent_st;
+typedef std::shared_ptr<CUevent_st> CUDAEventStub;

 namespace torch { namespace autograd {

@@ -32,7 +33,7 @@ struct TORCH_API CUDAStubs {
   virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) {
     fail();
   }
-  virtual float elapsed(CUDAEventStub event, CUDAEventStub event2) {
+  virtual float elapsed(const CUDAEventStub* event, const CUDAEventStub* event2) {
     fail();
     return 0.f;
   }
@@ -291,7 +292,7 @@ struct TORCH_API Event final {
   int64_t cpu_memory_usage_ = 0;
   int64_t cuda_memory_usage_ = 0;
   int device_ = -1;
-  struct CUevent_st* cuda_event = nullptr;
+  CUDAEventStub cuda_event = nullptr;
   int node_id_ = 0;
   bool is_remote_ = false;
   int64_t cuda_us_ = -1;
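A note on the header change: std::shared_ptr can be declared over a forward-declared (incomplete) type, and the type only needs to be complete where the owning pointer is constructed with its deleter, which here happens in profiler_cuda.cpp where the CUDA headers are included. A small sketch of the same pattern, using made-up names (Widget, make_widget):

// Sketch with hypothetical names: the "header" part only sees a forward
// declaration, mirroring how profiler.h forward-declares CUevent_st.
#include <memory>

struct Widget;                                   // incomplete type, like CUevent_st
using WidgetHandle = std::shared_ptr<Widget>;    // fine: shared_ptr allows incomplete T
WidgetHandle make_widget();

// The "source" part defines the type and supplies the deleter, mirroring
// profiler_cuda.cpp, which is the only place that calls cudaEventDestroy.
struct Widget { int value = 0; };

WidgetHandle make_widget() {
  return WidgetHandle(new Widget(), [](Widget* w) { delete w; });
}

int main() {
  WidgetHandle h = make_widget();   // deleter was type-erased at construction
  return 0;                         // h released here via the stored deleter
}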

‎torch/csrc/autograd/profiler_cuda.cpp

+10 -6

@@ -34,16 +34,20 @@ static inline void cudaCheck(cudaError_t result, const char * file, int line) {
 struct CUDAMethods : public CUDAStubs {
   void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) override {
     TORCH_CUDA_CHECK(cudaGetDevice(device));
-    TORCH_CUDA_CHECK(cudaEventCreate(event));
+    CUevent_st* cuda_event_ptr;
+    TORCH_CUDA_CHECK(cudaEventCreate(&cuda_event_ptr));
+    *event = std::shared_ptr<CUevent_st>(cuda_event_ptr, [](CUevent_st* ptr) {
+      TORCH_CUDA_CHECK(cudaEventDestroy(ptr));
+    });
     auto stream = at::cuda::getCurrentCUDAStream();
     *cpu_ns = getTime();
-    TORCH_CUDA_CHECK(cudaEventRecord(*event, stream));
+    TORCH_CUDA_CHECK(cudaEventRecord(cuda_event_ptr, stream));
   }
-  float elapsed(CUDAEventStub event, CUDAEventStub event2) override {
-    TORCH_CUDA_CHECK(cudaEventSynchronize(event));
-    TORCH_CUDA_CHECK(cudaEventSynchronize(event2));
+  float elapsed(const CUDAEventStub* event, const CUDAEventStub* event2) override {
+    TORCH_CUDA_CHECK(cudaEventSynchronize(event->get()));
+    TORCH_CUDA_CHECK(cudaEventSynchronize(event2->get()));
     float ms;
-    TORCH_CUDA_CHECK(cudaEventElapsedTime(&ms, event, event2));
+    TORCH_CUDA_CHECK(cudaEventElapsedTime(&ms, event->get(), event2->get()));
     return ms*1000.0;
   }
   void nvtxMarkA(const char* name) override {
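To see the new elapsed() path end to end, here is a hedged standalone sketch (illustrative record_event helper, error checking omitted) that records two ref-counted events and reads back the elapsed time the same way CUDAMethods::elapsed does:

// Illustrative sketch, not PyTorch source: create two shared_ptr-managed events,
// synchronize, and measure elapsed time; both events are destroyed automatically.
#include <cuda_runtime.h>
#include <cstdio>
#include <memory>

using CudaEventHandle = std::shared_ptr<CUevent_st>;

// Create an event whose lifetime is managed by the shared_ptr deleter.
CudaEventHandle record_event(cudaStream_t stream) {
  cudaEvent_t raw = nullptr;
  cudaEventCreate(&raw);
  cudaEventRecord(raw, stream);
  return CudaEventHandle(raw, [](CUevent_st* ptr) { cudaEventDestroy(ptr); });
}

int main() {
  CudaEventHandle start = record_event(0);     // default stream
  CudaEventHandle stop = record_event(0);
  cudaEventSynchronize(start.get());           // same calls as CUDAMethods::elapsed
  cudaEventSynchronize(stop.get());
  float ms = 0.f;
  cudaEventElapsedTime(&ms, start.get(), stop.get());
  std::printf("elapsed: %.3f ms\n", ms);       // the profiler reports ms * 1000.0 (microseconds)
  return 0;                                    // deleters run here; no leaked cudaEvent_t
}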

0 commit comments
