Sparse softmax support (CUDA) (pytorch#42307)

aocsa · facebook-github-bot · commit 29dc3c5ec821 · 2020-09-24T00:07:30.000-07:00
Summary: This PR implements softmax support for sparse tensors. Resolves pytorchgh-23651 for CUDA. - [x] sparse softmax - [x] CUDA C++ implementation - [x] unit tests - [x] update softmax documentation - [x] autograd support - [x] sparse log_softmax - [x] CUDA C++ implementation - [x] unit tests - [x] update log_softmax documentation - [x] autograd support Here are some benchmark results (script is [here](https://gist.github.com/aocsa/fbc1827b3e49901512a33ba96092cbc1)) for `torch.sparse.softmax` and `torch.softmax` on CPU and GPU; values are float64 scalars and the timing repeat is 1000: | size | density | sparse CUDA | sparse CPU | |--------------|---------|-------------|------------| | (32, 10000) | 0.01 | 380.2 | 687.5 | | (32, 10000) | 0.05 | 404.3 | 2357.9 | | (32, 10000) | 0.1 | 405.9 | 3677.2 | | (512, 10000) | 0.01 | 438.0 | 5443.4 | | (512, 10000) | 0.05 | 888.1 | 24485.0 | | (512, 10000) | 0.1 | 1921.3 | 45340.5 | | size | density | dense CUDA | dense CPU | |--------------|---------|-------------|------------| | (32, 10000) | 0.01 | 23.6 | 1943.2 | | (32, 10000) | 0.05 | 23.6 | 1954.0 | | (32, 10000) | 0.1 | 23.5 | 1950.0 | | (512, 10000) | 0.01 | 639.3 | 39797.9 | | (512, 10000) | 0.05 | 640.3 | 39374.4 | | (512, 10000) | 0.1 | 639.6 | 39192.3 | Times are in microseconds (us). Quick note: I updated the performance test again. Pull Request resolved: pytorch#42307 Reviewed By: ngimel Differential Revision: D23774427 Pulled By: mruberry fbshipit-source-id: bfabf726075b39dde544c10249f27ae1871f82c7
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
@@ -3676,11 +3676,13 @@
   use_c10_dispatcher: full
   dispatch:
     SparseCPU: softmax_sparse_cpu
+    SparseCUDA: softmax_sparse_cuda
 
 - func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor
   use_c10_dispatcher: full
   dispatch:
     SparseCPU: softmax_backward_sparse_cpu
+    SparseCUDA: softmax_backward_sparse_cuda
 
 - func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
   use_c10_dispatcher: full
@@ -3693,11 +3695,13 @@
   use_c10_dispatcher: full
   dispatch:
     SparseCPU: log_softmax_sparse_cpu
+    SparseCUDA: log_softmax_sparse_cuda
 
 - func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor
   use_c10_dispatcher: full
   dispatch:
     SparseCPU: log_softmax_backward_sparse_cpu
+    SparseCUDA: log_softmax_backward_sparse_cuda
 
 - func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor
   use_c10_dispatcher: full
diff --git a/aten/src/ATen/native/sparse/ParamUtils.cpp b/aten/src/ATen/native/sparse/ParamUtils.cpp
@@ -0,0 +1,73 @@
+#include <ATen/native/sparse/ParamUtils.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/ATen.h>
+#include <tuple>
+
+namespace at {
+namespace native {
+
+// Shared argument handling for the sparse softmax / log_softmax forward ops.
+//
+// Asserts that input_ is sparse, rejects half_to_float, and requires
+// 0 <= dim_ < input.dim(). function_name prefixes the TORCH_CHECK messages
+// so the failing operator is identifiable.
+//
+// Returns {input_.coalesce(), empty_like of the coalesced input}.
+std::pair<Tensor, Tensor> softmax_sparse_input_preprocessing(
+    const Tensor& input_,
+    const int64_t dim_,
+    const bool half_to_float,
+    CheckedFrom function_name) {
+  TORCH_INTERNAL_ASSERT(input_.is_sparse());
+  TORCH_CHECK(
+      !half_to_float,
+      function_name,
+      ": with half to float conversion is not supported on ",
+      input_.device().str());
+  auto input = input_.coalesce();
+  Tensor output = at::native::empty_like(input);
+  TORCH_CHECK(
+      dim_ >= 0 && dim_ < input.dim(),
+      function_name,
+      ": dim must be non-negative and less than input dimensions");
+  return std::make_pair(input, output);
+}
+
+// Shared argument handling for the sparse softmax / log_softmax backward ops.
+//
+// Requires grad_ and output_ to have the same size and, once coalesced, the
+// same number of sparse dimensions. dim_ is wrapped against grad_.dim() for
+// validation only; the wrapped value is not returned, so a caller that needs
+// it must wrap dim_ itself. input_ is not used here. function_name is passed
+// to checkSameSize and prefixes the TORCH_CHECK messages.
+//
+// Returns {empty_like of the coalesced output, coalesced grad,
+// coalesced output}.
+std::tuple<Tensor, Tensor, Tensor> softmax_backward_sparse_input_preprocessing(
+    const Tensor& grad_,
+    const Tensor& output_,
+    int64_t dim_,
+    const Tensor& input_,
+    CheckedFrom function_name) {
+  TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2};
+  checkSameSize(function_name, grad_arg, output_arg);
+
+  int64_t dim = maybe_wrap_dim(dim_, grad_.dim());
+
+  auto grad = grad_.coalesce();
+  auto output = output_.coalesce();
+
+  Tensor grad_input = at::native::empty_like(output);
+  TORCH_CHECK(
+      dim >= 0 && dim < grad.dim(),
+      function_name,
+      ": dim must be non-negative and less than input dimensions");
+  TORCH_CHECK(
+      grad.sparse_dim() == output.sparse_dim(),
+      function_name,
+      ": grad and output sparse dimensions must be equal");
+  return std::make_tuple(grad_input, grad, output);
+}
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/sparse/ParamUtils.h b/aten/src/ATen/native/sparse/ParamUtils.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <ATen/ATen.h>
+#include <ATen/TensorUtils.h>
+#include <tuple>
+#include <utility>
+
+namespace at {
+namespace native {
+
+// Validates and prepares the argument of the sparse softmax / log_softmax
+// forward ops. Returns the coalesced input paired with an output tensor
+// created by empty_like; function_name names the calling op in error messages.
+TORCH_API std::pair<Tensor, Tensor> softmax_sparse_input_preprocessing(
+    const Tensor& input_,
+    const int64_t dim_,
+    const bool half_to_float,
+    CheckedFrom function_name);
+
+// Validates and prepares the arguments of the sparse softmax / log_softmax
+// backward ops. Returns (grad_input, coalesced grad, coalesced output), where
+// grad_input is created by empty_like of the coalesced output.
+TORCH_API std::tuple<Tensor, Tensor, Tensor> softmax_backward_sparse_input_preprocessing(
+    const Tensor& grad_,
+    const Tensor& output_,
+    int64_t dim_,
+    const Tensor& input_,
+    CheckedFrom function_name);
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/sparse/SoftMax.cpp b/aten/src/ATen/native/sparse/SoftMax.cpp
@@ -4,6 +4,7 @@
 #include <ATen/SparseTensorUtils.h>
 #include <ATen/Parallel.h>
 #include <ATen/NamedTensorUtils.h>
+#include <ATen/native/sparse/ParamUtils.h>
 #include <map>
 
 namespace at {
@@ -291,10 +292,10 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
   if (dim >= sparse_dim) {
     if (LogSoftMax) {
       auto new_values = log_softmax_cpu(values, dim - sparse_dim + 1, false);
-      out_values.copy_(new_values);
+      out_values.set_(new_values);
     } else {
       auto new_values = softmax_cpu(values, dim - sparse_dim + 1, false);
-      out_values.copy_(new_values);
+      out_values.set_(new_values);
     }
     return;
   }
@@ -411,17 +412,27 @@ void cpu_sparse_coo_softmax_backward(Tensor& grad_input, const Tensor& grad, con
   auto grad_offsets = get_offsets(grad_indices, sizes, -1);
 
   if (dim >= sparse_dim) {
-    for(int64_t i=0; i<out_nnz; i++) {
-      Tensor unused;
-      auto low = std::lower_bound(grad_offsets.begin(), grad_offsets.end(), out_offsets[i]);
-      auto j = low - grad_offsets.begin();
-      if (j < grad_nnz && out_offsets[i] == grad_offsets[j]) {
-        if (LogSoftMax) {
-          auto r = log_softmax_backward_cpu(grad_values[j], out_values[i], dim - sparse_dim, unused);
-          values[i].copy_(r);
-        } else {
-          auto r = softmax_backward_cpu(grad_values[j], out_values[i], dim - sparse_dim, unused);
-          values[i].copy_(r);
+    Tensor unused;
+    if (out_offsets == grad_offsets) {
+      if (LogSoftMax) {
+        auto r = log_softmax_backward_cpu(grad_values, out_values, dim - sparse_dim + 1, unused);
+        values.set_(r);
+      } else {
+        auto r = softmax_backward_cpu(grad_values, out_values, dim - sparse_dim + 1, unused);
+        values.set_(r);
+      }
+    } else {
+      for(int64_t i=0; i<out_nnz; i++) {
+        auto low = std::lower_bound(grad_offsets.begin(), grad_offsets.end(), out_offsets[i]);
+        auto j = low - grad_offsets.begin();
+        if (j < grad_nnz && out_offsets[i] == grad_offsets[j]) {
+          if (LogSoftMax) {
+            auto r = log_softmax_backward_cpu(grad_values[j], out_values[i], dim - sparse_dim, unused);
+            values[i].copy_(r);
+          } else {
+            auto r = softmax_backward_cpu(grad_values[j], out_values[i], dim - sparse_dim, unused);
+            values[i].copy_(r);
+          }
         }
       }
     }
@@ -503,36 +514,36 @@ void cpu_sparse_coo_softmax_backward(Tensor& grad_input, const Tensor& grad, con
     });
 }
 
-} // namespace
+} // anonymous namespace
 
-Tensor softmax_sparse_cpu(const Tensor& input_, const int64_t dim_, const bool half_to_float) {
-  TORCH_INTERNAL_ASSERT(input_.is_sparse());
-  TORCH_CHECK(!half_to_float, "softmax with half to float conversion is not supported on CPU");
-  auto input = input_.coalesce();
-  Tensor output = at::native::empty_like(input);
+// Sparse softmax on CPU; validation and coalescing live in the shared helper.
+Tensor softmax_sparse_cpu(
+    const Tensor& input_, const int64_t dim, const bool half_to_float) {
+  auto prepared = softmax_sparse_input_preprocessing(
+      input_, dim, half_to_float, "softmax");
+  const Tensor input = std::move(prepared.first);
+  Tensor output = std::move(prepared.second);
   if (input.numel() == 0) {
     return output;
   }
-  TORCH_CHECK(dim_ >= 0 && dim_ < input.dim(),
-              "dim must be non-negative and less than input dimensions");
   AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "softmax", [&] {
-      cpu_sparse_coo_softmax<scalar_t, false>(output, input, dim_);
+    cpu_sparse_coo_softmax<scalar_t, false>(output, input, dim);
   });
   return output;
 }
 
-Tensor log_softmax_sparse_cpu(const Tensor& input_, const int64_t dim_, const bool half_to_float) {
-  TORCH_INTERNAL_ASSERT(input_.is_sparse());
-  TORCH_CHECK(!half_to_float, "log_softmax with half to float conversion is not supported on CPU");
-  auto input = input_.coalesce();
-  Tensor output = at::native::empty_like(input);
+// Sparse log_softmax on CPU; validation and coalescing live in the shared helper.
+Tensor log_softmax_sparse_cpu(
+    const Tensor& input_, const int64_t dim, const bool half_to_float) {
+  auto prepared = softmax_sparse_input_preprocessing(
+      input_, dim, half_to_float, "log_softmax");
+  const Tensor input = std::move(prepared.first);
+  Tensor output = std::move(prepared.second);
   if (input.numel() == 0) {
     return output;
   }
-  TORCH_CHECK(dim_ >= 0 && dim_ < input.dim(),
-              "dim must be non-negative and less than input dimensions");
   AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_softmax", [&] {
-      cpu_sparse_coo_softmax<scalar_t, true>(output, input, dim_);
+    cpu_sparse_coo_softmax<scalar_t, true>(output, input, dim);
   });
   return output;
 }
@@ -542,26 +553,16 @@ Tensor softmax_backward_sparse_cpu(
     const Tensor& output_,
     int64_t dim_,
     const Tensor& input_) {
-  TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2};
-  checkSameSize("softmax_backward", grad_arg, output_arg);
-
-  int64_t dim = maybe_wrap_dim(dim_, grad_.dim());
-
-  auto grad = grad_.coalesce();
-  auto output = output_.coalesce();
-
-  Tensor grad_input = at::native::empty_like(output);
+  Tensor grad_input, grad, output;
+  std::tie(grad_input, grad, output) =
+      softmax_backward_sparse_input_preprocessing(
+          grad_, output_, dim_, input_, "softmax_backward");
   if (output.numel() == 0) {
     return grad_input;
   }
-  TORCH_CHECK(
-      dim >= 0 && dim < grad.dim(),
-      "dim must be non-negative and less than input dimensions");
-  TORCH_CHECK(
-              grad.sparse_dim() == output.sparse_dim(),
-      "grad and output sparse dimensions must be equal");
   AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] {
-      cpu_sparse_coo_softmax_backward<scalar_t, false>(grad_input, grad, output, dim);
+    cpu_sparse_coo_softmax_backward<scalar_t, false>(
+        grad_input, grad, output, dim_);
   });
   return grad_input;
 }
@@ -571,26 +572,16 @@ Tensor log_softmax_backward_sparse_cpu(
     const Tensor& output_,
     int64_t dim_,
     const Tensor& input_) {
-  TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2};
-  checkSameSize("log_softmax_backward", grad_arg, output_arg);
-
-  int64_t dim = maybe_wrap_dim(dim_, grad_.dim());
-
-  auto grad = grad_.coalesce();
-  auto output = output_.coalesce();
-
-  Tensor grad_input = at::native::empty_like(output);
+  Tensor grad_input, grad, output;
+  std::tie(grad_input, grad, output) =
+      softmax_backward_sparse_input_preprocessing(
+          grad_, output_, dim_, input_, "log_softmax_backward");
   if (output.numel() == 0) {
     return grad_input;
   }
-  TORCH_CHECK(
-      dim >= 0 && dim < grad.dim(),
-      "dim must be non-negative and less than input dimensions");
-  TORCH_CHECK(
-              grad.sparse_dim() == output.sparse_dim(),
-      "grad and output sparse dimensions must be equal");
-  AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] {
-      cpu_sparse_coo_softmax_backward<scalar_t, true>(grad_input, grad, output, dim);
+  AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "log_softmax_backward", [&] {
+    cpu_sparse_coo_softmax_backward<scalar_t, true>(
+        grad_input, grad, output, dim_);
   });
   return grad_input;
 }
diff --git a/aten/src/ATen/native/sparse/cuda/SoftMax.cu b/aten/src/ATen/native/sparse/cuda/SoftMax.cu
diff --git a/test/test_sparse.py b/test/test_sparse.py