adding a beta parameter to the smooth_l1 loss fn (pytorch#44433)

bdhirsh · facebook-github-bot · commit 439930c81b5c · 2020-09-25T16:36:28.000-07:00
Summary: Pull Request resolved: pytorch#44433 Not entirely sure why, but changing the type of beta from `float` to `double in autocast_mode.cpp and FunctionsManual.h fixes my compiler errors, failing instead at link time fixing some type errors, updated fn signature in a few more files removing my usage of Scalar, making beta a double everywhere instead Test Plan: Imported from OSS Reviewed By: mrshenli Differential Revision: D23636720 Pulled By: bdhirsh fbshipit-source-id: caea2a1f8dd72b3b5fd1d72dd886b2fcd690af6d
diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp
@@ -357,7 +357,7 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
   KERNEL(ADD_NS(hinge_embedding_loss), "hinge_embedding_loss", Tensor (const Tensor &, const Tensor &, double, int64_t), fp32)
   KERNEL(ADD_NS(kl_div), "kl_div", Tensor (const Tensor &, const Tensor &, int64_t, bool), fp32)
   KERNEL(ADD_NS(l1_loss), "l1_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32)
-  KERNEL(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32)
+  KERNEL(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t, double), fp32)
   KERNEL(ADD_NS(mse_loss), "mse_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32)
   KERNEL(ADD_NS(margin_ranking_loss), "margin_ranking_loss", Tensor (const Tensor &, const Tensor &, const Tensor &, double, int64_t), fp32)
   KERNEL(ADD_NS(multilabel_margin_loss), "multilabel_margin_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32)
diff --git a/aten/src/ATen/native/BinaryOps.h b/aten/src/ATen/native/BinaryOps.h
@@ -25,6 +25,7 @@ inline void sub_check(const Tensor& self, const Tensor& other) {
 }
 
 using binary_fn_alpha = void(*)(TensorIterator&, Scalar alpha);
+using binary_fn_beta = void(*)(TensorIterator&, double beta);
 using binary_fn = void(*)(TensorIterator&);
 using binary_clamp_fn_alpha =
     void(*)(TensorIterator&, Scalar alpha, Scalar min_val, Scalar max_val);
@@ -54,7 +55,7 @@ DECLARE_DISPATCH(binary_fn, max_elementwise_stub);
 DECLARE_DISPATCH(binary_fn, min_elementwise_stub);
 DECLARE_DISPATCH(binary_fn, maximum_stub);
 DECLARE_DISPATCH(binary_fn, minimum_stub);
-DECLARE_DISPATCH(binary_fn, smooth_l1_stub);
+DECLARE_DISPATCH(binary_fn_beta, smooth_l1_stub);
 DECLARE_DISPATCH(binary_fn, sigmoid_backward_stub);
 DECLARE_DISPATCH(binary_fn_alpha, logit_backward_stub);
 DECLARE_DISPATCH(binary_fn, tanh_backward_stub);
diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp
@@ -295,38 +295,53 @@ Tensor soft_margin_loss(
   return output;
 }
 
-Tensor smooth_l1_loss(const Tensor& input, const Tensor& target, const int64_t reduction) {
+Tensor smooth_l1_loss(const Tensor& input, const Tensor& target, const int64_t reduction, double beta) {
+  if (beta <= 0)
+      return at::native::l1_loss(input, target, reduction);
   Tensor loss;
   auto iter = TensorIterator::binary_op(loss, input, target);
-  smooth_l1_stub(iter.device_type(), iter);
+  smooth_l1_stub(iter.device_type(), iter, beta);
   return apply_loss_reduction(iter.output(), reduction);
 }
 
-Tensor& smooth_l1_loss_out(Tensor& result, const Tensor& input, const Tensor& target, int64_t reduction) {
+Tensor& smooth_l1_loss_out(Tensor& result, const Tensor& input, const Tensor& target, int64_t reduction, double beta) {
+  if (beta <= 0)
+      return at::native::l1_loss_out(result, input, target, reduction);
   if (reduction != Reduction::None) {
-    result = at::smooth_l1_loss(input, target, reduction);
+    Tensor loss;
+    auto iter = TensorIterator::binary_op(loss, input, target);
+    smooth_l1_stub(iter.device_type(), iter, beta);
+    if (reduction == Reduction::Mean) {
+      at::mean_out(result, iter.output(), 0);
+    } else {
+      at::sum_out(result, iter.output(), 0);
+    }
   } else {
     auto iter = TensorIterator::binary_op(result, input, target);
-    smooth_l1_stub(iter.device_type(), iter);
+    smooth_l1_stub(iter.device_type(), iter, beta);
   }
   return result;
 }
 
-Tensor& smooth_l1_loss_backward_out(Tensor& grad_input, const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction) {
+Tensor& smooth_l1_loss_backward_out(Tensor& grad_input, const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction, double beta) {
+  if (beta <= 0)
+      return at::native::l1_loss_backward_out(grad_input, grad_output, input, target, reduction);
   auto norm = reduction == Reduction::Mean ? 1. / input.numel() : 1.;
   auto iter = at::TensorIteratorConfig()
     .add_output(grad_input)
     .add_input(input)
     .add_input(target)
     .add_input(grad_output)
     .build();
-  smooth_l1_backward_stub(iter.device_type(), iter, norm);
+  smooth_l1_backward_stub(iter.device_type(), iter, norm, beta);
   return grad_input;
 }
 
-Tensor smooth_l1_loss_backward(const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction) {
+Tensor smooth_l1_loss_backward(const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction, double beta) {
+  if (beta <= 0)
+      return at::native::l1_loss_backward(grad_output, input, target, reduction);
   auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  return at::smooth_l1_loss_backward_out(grad_input, grad_output, input, target, reduction);
+  return at::smooth_l1_loss_backward_out(grad_input, grad_output, input, target, reduction, beta);
 }
 
 Tensor mse_loss(const Tensor& input, const Tensor& target, int64_t reduction) {
diff --git a/aten/src/ATen/native/PointwiseOps.h b/aten/src/ATen/native/PointwiseOps.h
@@ -11,10 +11,11 @@ struct TensorIterator;
 namespace native {
 
 using pointwise_fn = void (*)(TensorIterator&, Scalar scalar);
+using pointwise_fn_beta = void (*)(TensorIterator&, Scalar scalar, double beta);
 
 DECLARE_DISPATCH(pointwise_fn, addcmul_stub);
 DECLARE_DISPATCH(pointwise_fn, addcdiv_stub);
-DECLARE_DISPATCH(pointwise_fn, smooth_l1_backward_stub);
+DECLARE_DISPATCH(pointwise_fn_beta, smooth_l1_backward_stub);
 DECLARE_DISPATCH(pointwise_fn, mse_backward_stub);
 
 } // namespace native
diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@@ -502,24 +502,25 @@ void minimum_kernel(TensorIterator& iter) {
   }
 }
 
-void smooth_l1_kernel(TensorIterator& iter) {
+void smooth_l1_kernel(TensorIterator& iter, double beta) {
   AT_DISPATCH_FLOATING_TYPES_AND2(
         kBFloat16, kHalf, iter.dtype(), "smooth_l1_cpu", [&]() {
         using Vec = Vec256<scalar_t>;
-        const Vec one_vec(static_cast<scalar_t>(1));
+        const scalar_t beta_val(beta);
+        const Vec beta_val_vec(beta_val);
         const Vec point_five_vec(static_cast<scalar_t>(0.5));
         cpu_kernel_vec(
             iter,
-            [](scalar_t a, scalar_t b) -> scalar_t {
+            [&beta_val](scalar_t a, scalar_t b) -> scalar_t {
               auto z = std::abs(a - b);
-              return z < static_cast<scalar_t>(1)
-                  ? static_cast<scalar_t>(0.5) * z * z
-                  : z - static_cast<scalar_t>(0.5);
+              return z < beta_val
+                  ? static_cast<scalar_t>(0.5) * z * z / beta_val
+                  : z - static_cast<scalar_t>(0.5) * beta_val;
             },
-            [&one_vec, &point_five_vec](Vec a, Vec b) {
+            [&beta_val_vec, &point_five_vec](Vec a, Vec b) {
               auto z = (a - b).abs();
               return Vec::blendv(
-                  point_five_vec * z * z, z - point_five_vec, z >= one_vec);
+                  point_five_vec * z * z / beta_val_vec, z - point_five_vec * beta_val_vec, z >= beta_val_vec);
             });
       });
 }
diff --git a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp
@@ -46,28 +46,39 @@ static void addcdiv_cpu_kernel(TensorIterator& iter, Scalar value) {
   });
 }
 
-static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, Scalar norm) {
+static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, Scalar norm, double beta) {
   ScalarType dtype = iter.dtype(0);
   AT_DISPATCH_ALL_TYPES(dtype, "smooth_l1_backward_cpu_out", [&] {
     auto norm_val = norm.to<scalar_t>();
+    scalar_t beta_val(beta);
     auto norm_val_vec = Vec256<scalar_t>(norm_val);
+    auto beta_val_vec = Vec256<scalar_t>(beta_val);
     const auto neg_1_vec = Vec256<scalar_t>(-1);
+    const auto zero_vec = Vec256<scalar_t>(0);
     const auto pos_1_vec = Vec256<scalar_t>(1);
     cpu_kernel_vec(iter,
       [=](scalar_t input, scalar_t target, scalar_t grad_output) -> scalar_t {
         const auto x = input - target;
-        if (x < -1.)
+        if (x <= -beta)
           return -norm_val * grad_output;
-        else if (x > 1.)
+        else if (x >= beta)
           return norm_val * grad_output;
         else
-          return norm_val * x * grad_output;
+          return norm_val * x * grad_output / beta;
       },
-      [norm_val_vec, neg_1_vec, pos_1_vec](
+      [norm_val_vec, beta_val_vec, neg_1_vec, zero_vec, pos_1_vec](
          Vec256<scalar_t> input, Vec256<scalar_t> target, Vec256<scalar_t> grad_output) -> Vec256<scalar_t> {
-        auto x = input - target;
-        x = clamp(x, neg_1_vec, pos_1_vec);
-        return norm_val_vec * x * grad_output;
+        // using two blendv calls to simulate the 3 cases
+        // 1        if  x >= beta
+        // -1       if x <= -beta
+        // x / beta if |x| < beta
+        const auto x = input - target;
+        const auto pos_or_neg_1_vec = Vec256<scalar_t>::blendv(
+            neg_1_vec, pos_1_vec, x > zero_vec);
+        const auto x_abs = x.abs();
+        const auto output = Vec256<scalar_t>::blendv(
+            x / beta_val_vec, pos_or_neg_1_vec, x_abs >= beta_val_vec);
+        return norm_val_vec * output * grad_output;
       }
     );
   });
diff --git a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu
@@ -19,11 +19,12 @@ void atan2_kernel_cuda(TensorIterator& iter) {
   });
 }
 
-void smooth_l1_kernel_cuda(TensorIterator& iter) {
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "smooth_l1_cuda", [&]() {
-    gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t {
+void smooth_l1_kernel_cuda(TensorIterator& iter, double beta) {
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "smooth_l1_cuda", [&iter, beta]() {
+    scalar_t beta_val(beta);
+    gpu_kernel(iter, [beta_val] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t {
       auto z = ::abs(a - b);
-      return z < scalar_t(1.) ? scalar_t(0.5) * z * z : z - scalar_t(0.5);
+      return z < beta_val ? scalar_t(0.5) * z * z / beta_val : z - scalar_t(0.5) * beta_val;
     });
   });
 }
diff --git a/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu b/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu
@@ -26,17 +26,18 @@ void addcdiv_cuda_kernel(TensorIterator& iter, Scalar value) {
   });
 }
 
-void smooth_l1_backward_cuda_kernel(TensorIterator& iter, Scalar norm) {
-  AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "smooth_l1_backward_cuda", [&]() {
+void smooth_l1_backward_cuda_kernel(TensorIterator& iter, Scalar norm, double beta) {
+  AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "smooth_l1_backward_cuda", [&iter, &norm, beta] {
       auto norm_val = norm.to<scalar_t>();
-      gpu_kernel(iter, [norm_val]GPU_LAMBDA(scalar_t input, scalar_t target, scalar_t grad_output) -> scalar_t {
+      scalar_t beta_val(beta);
+      gpu_kernel(iter, [norm_val, beta_val]GPU_LAMBDA(scalar_t input, scalar_t target, scalar_t grad_output) -> scalar_t {
         const auto x = input - target;
-        if (x < scalar_t(-1))
+        if (x < -beta_val)
           return -norm_val * grad_output;
-        else if (x > scalar_t(1))
+        else if (x > beta_val)
           return norm_val * grad_output;
         else
-          return norm_val * x * grad_output;
+          return norm_val * x * grad_output / beta_val;
     });
   });
 }
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
@@ -6767,25 +6767,25 @@
     CPU: nll_loss2d_backward_cpu
     CUDA: legacy::cuda::_thnn_nll_loss2d_backward
 
-- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
   dispatch:
     CPU: smooth_l1_loss_out
     CUDA: smooth_l1_loss_out
 
-- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
+- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor
   use_c10_dispatcher: full
   python_module: nn
   dispatch:
     CPU, CUDA: smooth_l1_loss
 
-- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)
+- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta=1.0, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
   dispatch:
     CPU: smooth_l1_loss_backward_out
     CUDA: smooth_l1_loss_backward_out
 
-- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
+- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta=1.0) -> Tensor
   use_c10_dispatcher: full
   python_module: nn
 
diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp
@@ -246,6 +246,18 @@ TEST_F(FunctionalTest, SmoothL1LossDefaultOptions) {
   ASSERT_TRUE(input.sizes() == input.grad().sizes());
 }
 
+TEST_F(FunctionalTest, SmoothL1LossBeta) {
+  auto input = torch::tensor({0.1, 1.5, 10.0}, torch::dtype(torch::kFloat).requires_grad(true));
+  auto target = torch::tensor({0., 1., 5.}, torch::kFloat);
+  auto output =
+      F::smooth_l1_loss(input, target, /*reduction=*/torch::kMean, /*beta=*/0.5);
+  auto expected = torch::tensor(1.67, torch::kFloat);
+  auto s = output.sum();
+  s.backward();
+  ASSERT_TRUE(output.allclose(expected));
+  ASSERT_TRUE(input.sizes() == input.grad().sizes());
+}
+
 TEST_F(FunctionalTest, SmoothL1LossNoReduction) {
   auto input = torch::tensor({0.1, 1.2, 4.7}, torch::dtype(torch::kFloat).requires_grad(true));
   auto target = torch::tensor({0., 1., 5.}, torch::kFloat);
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
@@ -1221,9 +1221,9 @@
   self: nll_loss2d_backward(grad, self, target, weight, reduction, ignore_index, total_weight)
   target: non_differentiable
 
-- name: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
-  self: smooth_l1_loss_backward(grad, self, target, reduction)
-  target: smooth_l1_loss_backward(grad, target, self, reduction)
+- name: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor
+  self: smooth_l1_loss_backward(grad, self, target, reduction, beta)
+  target: smooth_l1_loss_backward(grad, target, self, reduction, beta)
 
 - name: soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
   self: soft_margin_loss_backward(grad, self, target, reduction)
@@ -1589,10 +1589,10 @@
   grad_output: replication_pad3d(grad, padding)
   self: zeros_like(self)
 
-- name: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
-  grad_output: smooth_l1_loss_double_backward_grad_output(grad, grad_output, self, target, reduction)
-  self: smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction)
-  target: -smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction)
+- name: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta=1.0) -> Tensor
+  grad_output: smooth_l1_loss_double_backward_grad_output(grad, grad_output, self, target, reduction, beta)
+  self: smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta)
+  target: -smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta)
 
 - name: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor
   grad_output: softplus_backward(grad, self, beta, threshold, output)
diff --git a/torch/csrc/api/include/torch/nn/functional/loss.h b/torch/csrc/api/include/torch/nn/functional/loss.h
@@ -307,25 +307,26 @@ inline Tensor cosine_embedding_loss(
 
 // ============================================================================
 
-inline Tensor _smooth_l1_loss(const Tensor& input, const Tensor& target) {
+inline Tensor _smooth_l1_loss(const Tensor& input, const Tensor& target, double beta = 1.) {
     auto t = torch::abs(input - target);
-    return torch::where(t < 1, 0.5 * torch::pow(t, 2), t - 0.5);
+    return torch::where(t < beta, 0.5 * torch::pow(t, 2) / beta, t - 0.5 * beta);
 }
 
 #ifndef DOXYGEN_SHOULD_SKIP_THIS
 namespace detail {
 inline Tensor smooth_l1_loss(
     const Tensor& input,
     const Tensor& target,
-    SmoothL1LossFuncOptions::reduction_t reduction) {
+    SmoothL1LossFuncOptions::reduction_t reduction,
+    double beta = 1.) {
   if (target.sizes() != input.sizes()) {
     TORCH_WARN("Using a target size (", target.sizes(), ") that is different to the input size (", input.sizes(), "). ",
                   "This will likely lead to incorrect results due to broadcasting. ",
                   "Please ensure they have the same size.");
   }
 
   std::vector<Tensor> expanded_tensors = torch::broadcast_tensors({input, target});
-  return torch::smooth_l1_loss(expanded_tensors[0], expanded_tensors[1], enumtype::reduction_get_enum(reduction));
+  return torch::smooth_l1_loss(expanded_tensors[0], expanded_tensors[1], enumtype::reduction_get_enum(reduction), beta);
 }
 } // namespace detail
 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
@@ -344,8 +345,9 @@ inline Tensor smooth_l1_loss(
 inline Tensor smooth_l1_loss(
     const Tensor& input,
     const Tensor& target,
-    const SmoothL1LossFuncOptions& options = {}) {
-  return detail::smooth_l1_loss(input, target, options.reduction());
+    const SmoothL1LossFuncOptions& options = {},
+    double beta = 1.) {
+  return detail::smooth_l1_loss(input, target, options.reduction(), beta);
 }
 
 // ============================================================================
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
@@ -957,20 +957,24 @@ Tensor l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & i
   return output;
 }
 
-Tensor smooth_l1_loss_double_backward(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction) {
+Tensor smooth_l1_loss_double_backward(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction, double beta) {
+  // special case to protect against a divide-by-zero.
+  if (beta == 0) {
+      return at::zeros(grad.sizes(), grad.options());
+  }
   auto d = (input - target).abs();
-  auto grad_input = grad * (d < 1).type_as(grad);
+  auto grad_input = grad * (d < beta).type_as(grad) / beta;
   if (reduction == at::Reduction::Mean) {
     grad_input /= input.numel();
   }
   return grad_input;
 }
 
-Tensor smooth_l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & grad_output, const Tensor & input, const Tensor & target, int64_t reduction) {
+Tensor smooth_l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & grad_output, const Tensor & input, const Tensor & target, int64_t reduction, double beta) {
   if (reduction == at::Reduction::None) {
-    return smooth_l1_loss_backward(grad, input, target, reduction);
+    return smooth_l1_loss_backward(grad, input, target, reduction, beta);
   }
-  auto r = smooth_l1_loss_backward(ones_like(grad_output), input, target, reduction);
+  auto r = smooth_l1_loss_backward(ones_like(grad_output), input, target, reduction, beta);
   return (r * grad).sum();
 }
 
diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h
@@ -104,8 +104,8 @@ at::Tensor log_softmax_double_backward(const at::Tensor & grad, const at::Tensor
 at::Tensor binary_cross_entropy_double_backward(const at::Tensor & grad_output, const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, const c10::optional<at::Tensor>& weight, int64_t reduction);
 at::Tensor binary_cross_entropy_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, const c10::optional<at::Tensor>& weight, int64_t reduction);
 at::Tensor l1_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction);
-at::Tensor smooth_l1_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction);
-at::Tensor smooth_l1_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & target, int64_t reduction);
+at::Tensor smooth_l1_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction, double beta);
+at::Tensor smooth_l1_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & target, int64_t reduction, double beta);
 at::Tensor mse_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, int64_t reduction);
 at::Tensor mse_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & target, int64_t reduction);
 at::Tensor soft_margin_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction);
diff --git a/torch/nn/functional.py b/torch/nn/functional.py
diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
diff --git a/torch/overrides.py b/torch/overrides.py
diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py

Original file line number	Diff line number	Diff line change
`@@ -957,20 +957,24 @@ Tensor l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & i`
`957`	`957`	`return output;`
`958`	`958`	`}`
`959`	`959`
`960`		`-Tensor smooth_l1_loss_double_backward(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction) {`
	`960`	`+Tensor smooth_l1_loss_double_backward(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction, double beta) {`
	`961`	`+ // special case to protect against a divide-by-zero.`
	`962`	`+ if (beta == 0) {`
	`963`	`+ return at::zeros(grad.sizes(), grad.options());`
	`964`	`+ }`
`961`	`965`	`auto d = (input - target).abs();`
`962`		`- auto grad_input = grad * (d < 1).type_as(grad);`
	`966`	`+ auto grad_input = grad * (d < beta).type_as(grad) / beta;`
`963`	`967`	`if (reduction == at::Reduction::Mean) {`
`964`	`968`	`grad_input /= input.numel();`
`965`	`969`	`}`
`966`	`970`	`return grad_input;`
`967`	`971`	`}`
`968`	`972`
`969`		`-Tensor smooth_l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & grad_output, const Tensor & input, const Tensor & target, int64_t reduction) {`
	`973`	`+Tensor smooth_l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & grad_output, const Tensor & input, const Tensor & target, int64_t reduction, double beta) {`
`970`	`974`	`if (reduction == at::Reduction::None) {`
`971`		`- return smooth_l1_loss_backward(grad, input, target, reduction);`
	`975`	`+ return smooth_l1_loss_backward(grad, input, target, reduction, beta);`
`972`	`976`	`}`
`973`		`- auto r = smooth_l1_loss_backward(ones_like(grad_output), input, target, reduction);`
	`977`	`+ auto r = smooth_l1_loss_backward(ones_like(grad_output), input, target, reduction, beta);`
`974`	`978`	`return (r * grad).sum();`
`975`	`979`	`}`
`976`	`980`