
Commit 241afc9

RockingJavaBean authored and facebook-github-bot committed on Sep 25, 2020
Migrate addr from the TH to Aten (CPU) (pytorch#44364)
Summary: Related: pytorch#24507. Fixes pytorch#24666. This PR modernizes the CPU implementation of the vector `outer product` by migrating the existing TH implementation of `torch.addr` to `aten`; `torch.ger` relies on the `addr` functions to compute the outer product.
Pull Request resolved: pytorch#44364
Reviewed By: ezyang
Differential Revision: D23866733
Pulled By: mruberry
fbshipit-source-id: 5159ea22f0e3c991123fe7c19cc9beb6ad00301e
1 parent 99e0a87 commit 241afc9

16 files changed, +194 -672 lines changed
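
A quick sanity sketch of the identity the port relies on (not code from this commit): the deprecation warning added in LinearAlgebra.cpp below states that torch.addr can be written as alpha * torch.outer(vec1, vec2) + beta * input. A minimal standalone C++ check of that equivalence, assuming an ATen build that already contains this change (so that at::outer is available):

#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor vec1  = at::randn({3});
  at::Tensor vec2  = at::randn({4});
  at::Tensor input = at::randn({3, 4});

  // addr(input, vec1, vec2, beta, alpha) should match beta * input + alpha * outer(vec1, vec2)
  at::Tensor via_addr  = at::addr(input, vec1, vec2, /*beta=*/0.5, /*alpha=*/2.0);
  at::Tensor via_outer = 0.5 * input + 2.0 * at::outer(vec1, vec2);

  std::cout << "match: " << at::allclose(via_addr, via_outer) << std::endl;  // expected: match: 1
  return 0;
}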
 

‎aten/src/ATen/LegacyTHFunctionsCPU.cpp

-255 lines; large diffs are not rendered by default.

‎aten/src/ATen/LegacyTHFunctionsCPU.h

-3 lines

@@ -39,9 +39,6 @@ Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
 Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max);
 Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max);
 Tensor _th_trace(const Tensor & self);
-Tensor & _th_addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
-Tensor _th_addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
-Tensor & _th_addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
 std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A);
 std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A);
 std::tuple<Tensor &,Tensor &> _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors);

‎aten/src/ATen/cuda/CUDABlas.cpp

-40 lines

@@ -498,46 +498,6 @@ void gemv<at::BFloat16>(CUDABLAS_GEMV_ARGTYPES(at::BFloat16)) {
 }
 #endif
 
-namespace {
-template<typename scalar_t>
-cublasStatus_t cublasGer(const cublasHandle_t &handle, int64_t m, int64_t n, scalar_t *alpha, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy, scalar_t *a, int64_t lda) {
-  TORCH_CHECK(false, "cublas ger is defined only for float and double");
-  return {};
-}
-template<>
-cublasStatus_t cublasGer<float>(const cublasHandle_t &handle, int64_t m, int64_t n, float *alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda) {
-  return cublasSger(handle, m, n, alpha, x, incx, y, incy, a, lda);
-}
-template<>
-cublasStatus_t cublasGer<double>(const cublasHandle_t &handle, int64_t m, int64_t n, double *alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda) {
-  return cublasDger(handle, m, n, alpha, x, incx, y, incy, a, lda);
-}
-} // anonymous namespace
-
-template<typename scalar_t>
-void ger(int64_t m, int64_t n, scalar_t alpha, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy, scalar_t *a, int64_t lda)
-{
-  _cublasAdjustLdLevel2(m, n, &lda);
-  TORCH_CHECK((m <= INT_MAX) &&
-              (n <= INT_MAX) &&
-              (lda <= INT_MAX) &&
-              (incx <= INT_MAX) &&
-              (incy <= INT_MAX),
-              "cublasSger/cublasDger only supports m, n, lda, incx, incy with "
-              "the bound [val] <= %d", INT_MAX);
-  int i_m = (int)m;
-  int i_n = (int)n;
-  int i_lda = (int)lda;
-  int i_incx = (int)incx;
-  int i_incy = (int)incy;
-
-  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
-  TORCH_CUDABLAS_CHECK(cublasGer<scalar_t>(
-    handle, i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda));
-}
-template void ger<float>(int64_t m, int64_t n, float alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda);
-template void ger<double>(int64_t m, int64_t n, double alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda);
-
 /* LEVEL 1 BLAS FUNCTIONS */
 
 template <>
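
For context on the bound checks in the wrapper removed above: cublasSger and cublasDger take 32-bit int sizes and strides, so the 64-bit values had to be range-checked before narrowing. A minimal illustration of that guard (hypothetical helper name, not code from this commit):

#include <climits>
#include <cstdint>
#include <stdexcept>
#include <string>

// Reject 64-bit dimensions/strides that do not fit the int parameters cuBLAS expects.
static int narrow_to_int(int64_t value, const char* name) {
  if (value > INT_MAX) {
    throw std::runtime_error(std::string(name) + " exceeds INT_MAX; cublasSger/cublasDger cannot be called");
  }
  return static_cast<int>(value);
}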

‎aten/src/ATen/native/LinearAlgebra.cpp

+37 -26 lines

@@ -143,50 +143,61 @@ static void check_1d(const Tensor& t, const char* arg, const char* fn) {
 }
 
 Tensor addr(const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) {
-  check_1d(vec1, "vec1", "addr");
-  check_1d(vec2, "vec2", "addr");
-  Tensor b_self;
-  std::tie(b_self) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr");
-  return at::_addr(b_self, vec1, vec2, beta, alpha);
+  TORCH_WARN(
+    "torch.addr is deprecated and may be removed in a future PyTorch release. "
+    "This function can be implemented using torch.outer as "
+    "alpha * torch.outer(vec1, vec2) + beta * input when beta is not zero, "
+    "alpha * torch.outer(vec1, vec2) when beta is zero.");
+
+  Tensor outer_result = at::outer(vec1, vec2) * alpha;
+  if (beta.to<double>() == 0.0) {
+    return outer_result;
+  }
+  return outer_result + (self * beta);
 }
 
 Tensor& addr_(Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) {
-  check_1d(vec1, "vec1", "addr");
-  check_1d(vec2, "vec2", "addr");
-  return at::_addr_(self, vec1, vec2, beta, alpha);
+  return at::addr_out(self, self, vec1, vec2, beta, alpha);
 }
 
 Tensor& addr_out(Tensor &result, const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) {
-  check_1d(vec1, "vec1", "addr");
-  check_1d(vec2, "vec2", "addr");
-  Tensor b_self;
-  std::tie(b_self) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr_out");
-  return at::_addr_out(result, b_self, vec1, vec2, beta, alpha);
+  auto addr_result = at::addr(self, vec1, vec2, beta, alpha);
+  // Validates safe casting
+  const auto result_dtype = addr_result.scalar_type();
+  TORCH_CHECK(canCast(result_dtype, result.scalar_type()),
+              "result type ", result_dtype,
+              " can't be cast to the desired output type ", result.scalar_type());
+
+  at::native::resize_output(result, addr_result.sizes().vec());
+  result.copy_(addr_result);
+  return result;
 }
 
+// torch.ger, alias for torch.outer
 Tensor& ger_out(Tensor &result, const Tensor& self, const Tensor& vec2) {
-  check_1d(self, "self", "ger");
-  check_1d(vec2, "vec2", "ger");
-  if (result.dim() != 2 || result.size(0) != self.size(0) || result.size(1) != vec2.size(0)) {
-    result.resize_({ self.size(0), vec2.size(0) });
-  }
-  // resize_ does the "broadcasting", don't need to broadcast again.
-  return at::_addr_out(result, result, self, vec2, Scalar(0), Scalar(1));
+  TORCH_WARN("torch.ger is deprecated and will be removed in a future PyTorch release. "
+             "Use torch.outer instead.");
+  return at::outer_out(result, self, vec2);
 }
 
 Tensor ger(const Tensor& self, const Tensor& vec2) {
-  Tensor result = at::empty({0}, self.options());
-  at::ger_out(result, self, vec2);
-  return result;
+  return self.outer(vec2);
 }
 
-// torch.outer, alias for torch.ger
 Tensor& outer_out(Tensor &result, const Tensor& self, const Tensor& vec2) {
-  return at::ger_out(result, self, vec2);
+  check_1d(self, "self", "outer");
+  check_1d(vec2, "vec2", "outer");
+
+  // torch.outer is implemented as a composite op using reshape and mul
+  at::mul_out(result, self.reshape({self.size(0), 1}), vec2);
+  return result;
 }
 
 Tensor outer(const Tensor& self, const Tensor& vec2) {
-  return self.ger(vec2);
+  check_1d(self, "self", "outer");
+  check_1d(vec2, "vec2", "outer");
+
+  return self.reshape({self.size(0), 1}) * vec2;
 }
 
 static void addmm_impl_cpu_(
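
One behavioural point in the composite CPU addr above: when beta is zero the function returns alpha * outer(vec1, vec2) directly and never reads self, so values in self (including NaN or Inf) are not folded into the result, exactly as the deprecation warning describes. A small sketch of that case (an illustrative example, assuming the composite implementation shown above):

#include <ATen/ATen.h>
#include <iostream>
#include <limits>

int main() {
  at::Tensor vec1 = at::ones({2}, at::kDouble);
  at::Tensor vec2 = at::ones({3}, at::kDouble);
  // self is full of NaN, but with beta == 0 the composite never multiplies it in
  at::Tensor self = at::full({2, 3}, std::numeric_limits<double>::quiet_NaN(), at::kDouble);

  at::Tensor out = at::addr(self, vec1, vec2, /*beta=*/0, /*alpha=*/1);
  std::cout << out << std::endl;  // all ones; no NaN propagated from self
  return 0;
}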

‎aten/src/ATen/native/cuda/LinearAlgebra.cu

-114 lines

@@ -178,120 +178,6 @@ Tensor& addmm__cuda(Tensor& self, const Tensor& mat1, const Tensor& mat2,
   return self;
 }
 
-template<typename scalar_t>
-void addr_impl_ger_cuda(Tensor &out, const Tensor &self,
-                        const Tensor& vec1, const Tensor& vec2,
-                        scalar_t alpha, scalar_t beta) {
-  static_assert(std::is_same<scalar_t, float>::value ||
-                std::is_same<scalar_t, double>::value,
-                "addr_impl_ger_cuda: only float and double are supported");
-  if (&out != &self) {
-    at::native::resize_as_(out, self);
-    at::native::copy_(out, self);
-  }
-  if (beta == 0.0) {
-    at::native::zero_(out);
-  }
-  if (beta != 1.0) {
-    at::native::mul_(out, beta);
-  }
-  if (out.stride(0) == 1) {
-    at::cuda::blas::ger<scalar_t>(
-      vec1.size(0), vec2.size(0), alpha,
-      vec1.data_ptr<scalar_t>(), vec1.stride(0),
-      vec2.data_ptr<scalar_t>(), vec2.stride(0),
-      out.data_ptr<scalar_t>(), out.stride(1)
-    );
-  } else if (out.stride(1) == 1) {
-    at::cuda::blas::ger<scalar_t>(
-      vec2.size(0), vec1.size(0), alpha,
-      vec2.data_ptr<scalar_t>(), vec2.stride(0),
-      vec1.data_ptr<scalar_t>(), vec1.stride(0),
-      out.data_ptr<scalar_t>(), out.stride(0)
-    );
-  } else {
-    Tensor cr = out.clone();
-    at::cuda::blas::ger<scalar_t>(
-      vec2.size(0), vec1.size(0), alpha,
-      vec2.data_ptr<scalar_t>(), vec2.stride(0),
-      vec1.data_ptr<scalar_t>(), vec1.stride(0),
-      out.data_ptr<scalar_t>(), out.stride(0)
-    );
-    out.set_(cr);
-  }
-}
-
-template<typename scalar_t>
-void addr_impl_cuda(Tensor &out, const Tensor &self,
-                    const Tensor& vec1, const Tensor& vec2,
-                    scalar_t alpha, scalar_t beta) {
-  // currently no Hger/SgerEx in Cublas.
-  Tensor vec2T = vec2.reshape({1, vec2.size(0)});
-  Tensor vec1M = vec1.reshape({vec1.size(0), 1});
-  addmm_out_cuda(out, self, vec1M, vec2T, beta, alpha);
-}
-template<>
-void addr_impl_cuda<float>(Tensor &out, const Tensor &self,
-                           const Tensor& vec1, const Tensor& vec2,
-                           float alpha, float beta) {
-  addr_impl_ger_cuda<float>(out, self, vec1, vec2, alpha, beta);
-}
-template<>
-void addr_impl_cuda<double>(Tensor &out, const Tensor &self,
-                            const Tensor& vec1, const Tensor& vec2,
-                            double alpha, double beta) {
-  addr_impl_ger_cuda<double>(out, self, vec1, vec2, alpha, beta);
-}
-
-Tensor& addr_out_cuda(Tensor &out, const Tensor& self,
-                      const Tensor& vec1, const Tensor& vec2,
-                      Scalar beta, Scalar alpha) {
-  TORCH_CHECK(vec1.dim() == 1 && vec2.dim() == 1,
-              "vec1 and vec2 should be 1-dimensional vectors. Got dimensions ",
-              vec1.dim(), " and ", vec2.dim());
-
-  Tensor self_;
-  if (&out != &self) {
-    std::tie(self_) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr");
-  } else {
-    self_ = self;
-  }
-
-  TORCH_CHECK(out.device() == self_.device() &&
-              out.device() == vec1.device() &&
-              out.device() == vec2.device(),
-              "Expected all tensors to be on the same device. Found: ",
-              out.device(), ", ", self_.device(), ", ",
-              vec1.device(), " and ", vec2.device());
-  TORCH_CHECK(self_.dim() == 2,
-              "2D tensor expected, got ", self_.dim(), "D tensor for input");
-  TORCH_CHECK(self_.size(0) == vec1.size(0) && self_.size(1) == vec2.size(0),
-              "size mismatch",
-              ", input: ", self_.sizes(),
-              ", v1: ", vec1.sizes(),
-              ", v2: ", vec2.sizes());
-  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, self_.scalar_type(), "addr_out_cuda", [&] {
-    addr_impl_cuda<scalar_t>(out, self_, vec1, vec2,
-                             alpha.to<scalar_t>(), beta.to<scalar_t>());
-  });
-  return out;
-}
-
-Tensor& addr__cuda(Tensor& self,
-                   const Tensor& vec1, const Tensor& vec2,
-                   Scalar beta, Scalar alpha) {
-  addr_out_cuda(self, self, vec1, vec2, beta, alpha);
-  return self;
-}
-
-Tensor addr_cuda(const Tensor& self,
-                 const Tensor& vec1, const Tensor& vec2,
-                 Scalar beta, Scalar alpha) {
-  Tensor out = at::empty({0}, self.options());
-  addr_out_cuda(out, self, vec1, vec2, beta, alpha);
-  return out;
-}
-
 Tensor& addbmm_out_cuda(Tensor& out, const Tensor& self,
                         const Tensor& batch1, const Tensor& batch2,
                         Scalar beta, Scalar alpha) {

‎aten/src/ATen/native/native_functions.yaml

-17 lines

@@ -6238,23 +6238,6 @@
   use_c10_dispatcher: full
   variants: method, function
 
-- func: _addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
-  use_c10_dispatcher: full
-  dispatch:
-    CPU: legacy::cpu::_th_addr
-    CUDA: addr_cuda
-
-- func: _addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
-  use_c10_dispatcher: full
-  dispatch:
-    CPU: legacy::cpu::_th_addr_
-    CUDA: addr__cuda
-
-- func: _addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
-  dispatch:
-    CPU: legacy::cpu::_th_addr_out
-    CUDA: addr_out_cuda
-
 - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
   use_c10_dispatcher: full
   dispatch:

‎aten/src/TH/generic/THBlas.cpp

-49 lines

@@ -14,8 +14,6 @@ TH_EXTERNC void dcopy_(int *n, double *x, int *incx, double *y, int *incy);
 TH_EXTERNC void scopy_(int *n, float *x, int *incx, float *y, int *incy);
 TH_EXTERNC void daxpy_(int *n, double *a, double *x, int *incx, double *y, int *incy);
 TH_EXTERNC void saxpy_(int *n, float *a, float *x, int *incx, float *y, int *incy);
-TH_EXTERNC void dger_(int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *a, int *lda);
-TH_EXTERNC void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda);
 
 void THBlas_(swap)(int64_t n, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy)
 {
@@ -111,51 +109,4 @@ void THBlas_(axpy)(int64_t n, scalar_t a, scalar_t *x, int64_t incx, scalar_t *y
   }
 }
 
-void THBlas_(ger)(
-    int64_t m,
-    int64_t n,
-    scalar_t alpha,
-    scalar_t *x,
-    int64_t incx,
-    scalar_t *y,
-    int64_t incy,
-    scalar_t *a,
-    int64_t lda)
-{
-  if(n == 1)
-    lda = m;
-
-#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
-  if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) &&
-      (incx > 0) && (incx <= INT_MAX) &&
-      (incy > 0) && (incy <= INT_MAX) )
-  {
-    THArgCheck(lda >= THMax(1, m), 9,
-      "lda should be at least max(1, m=%d), but have %d", m, lda);
-    int i_m = (int)m;
-    int i_n = (int)n;
-    int i_lda = (int)lda;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
-
-#if defined(TH_REAL_IS_DOUBLE)
-    dger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda);
-#else
-    sger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda);
-#endif
-    return;
-  }
-#endif
-  {
-    int64_t i, j;
-    for(j = 0; j < n; j++)
-    {
-      scalar_t *column_ = a+j*lda;
-      scalar_t z = alpha*y[j*incy];
-      for(i = 0; i < m; i++)
-        column_[i] += z*x[i*incx] ;
-    }
-  }
-}
-
 #endif
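
The THBlas_(ger) routine removed above is the classic column-major rank-1 update, a[i + j*lda] += alpha * x[i*incx] * y[j*incy], with a hand-written loop as the non-BLAS fallback. A self-contained sketch of the same update without the TH macros (illustrative only; ger_reference is not a name used in the codebase):

#include <cstdint>

// Column-major rank-1 update A += alpha * x * y^T, mirroring the removed fallback loop.
template <typename scalar_t>
void ger_reference(int64_t m, int64_t n, scalar_t alpha,
                   const scalar_t* x, int64_t incx,
                   const scalar_t* y, int64_t incy,
                   scalar_t* a, int64_t lda) {
  for (int64_t j = 0; j < n; j++) {
    scalar_t* column = a + j * lda;
    const scalar_t z = alpha * y[j * incy];
    for (int64_t i = 0; i < m; i++) {
      column[i] += z * x[i * incx];
    }
  }
}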

‎aten/src/TH/generic/THBlas.h

-3 lines

@@ -7,7 +7,4 @@ TH_API void THBlas_(swap)(int64_t n, scalar_t *x, int64_t incx, scalar_t *y, int
 TH_API void THBlas_(copy)(int64_t n, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy);
 TH_API void THBlas_(axpy)(int64_t n, scalar_t a, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy);
 
-/* Level 2 */
-TH_API void THBlas_(ger)(int64_t m, int64_t n, scalar_t alpha, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy, scalar_t *a, int64_t lda);
-
 #endif
