
Commit 26001a2

Xinyu Li authored and facebook-github-bot committed on Sep 24, 2020
Revert D23753711: [pytorch][PR] Add foreach APIs for binary ops with ScalarList
Test Plan: revert-hammer

Differential Revision: D23753711 (pytorch@71d1b5b)

Original commit changeset: bf3e8c54bc07

fbshipit-source-id: 192692e0d3fff4cade9983db0a1760fedfc9674c
1 parent c79d493, commit 26001a2

15 files changed: +119 -843 lines
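For orientation (this note and sketch are not part of the commit): the reverted ScalarList overloads took one scalar per tensor, while the single-Scalar overloads listed in native_functions.yaml below remain. A rough Python-level sketch, assuming the pre-revert API surface:

    import torch

    tensors = [torch.ones(2), torch.ones(3)]

    # Scalar overload: still present after this revert; one scalar is applied to every tensor.
    out = torch._foreach_add(tensors, 1.0)

    # ScalarList overload: removed by this revert, e.g. torch._foreach_add(tensors, [1.0, 2.0]).
    # The straightforward per-tensor equivalent is:
    out_per_tensor = [t.add(s) for t, s in zip(tensors, [1.0, 2.0])]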
 

‎aten/src/ATen/native/ForeachOpsKernels.cpp

-24
@@ -24,26 +24,6 @@ std::vector<Tensor> foreach_tensor_##NAME##_scalar_kernel_slow(TensorList tensor
   return result; \
 }
 
-#define FOREACH_BINARY_OP_SCALARLIST(NAME) \
-void foreach_tensor_##NAME##_scalarlist_kernel_slow_(TensorList tensors, at::ArrayRef<double> scalars) { \
-  check_foreach_api_restrictions(tensors, scalars); \
-  \
-  for (int i = 0; i < tensors.size(); i++) { \
-    tensors[i].NAME##_(scalars[i]); \
-  } \
-} \
-\
-std::vector<Tensor> foreach_tensor_##NAME##_scalarlist_kernel_slow(TensorList tensors, at::ArrayRef<double> scalars) { \
-  check_foreach_api_restrictions(tensors, scalars); \
-  std::vector<Tensor> result; \
-  result.reserve(tensors.size()); \
-  for (int i = 0; i < tensors.size(); i++) { \
-    result.emplace_back(tensors[i].NAME(scalars[i])); \
-  } \
-  \
-  return result; \
-}
-
 #define FOREACH_BINARY_OP_LIST(NAME) \
 std::vector<Tensor> foreach_tensor_##NAME##_list_kernel_slow(TensorList tensors1, TensorList tensors2) { \
   check_foreach_api_restrictions(tensors1, tensors2); \
 
@@ -137,10 +117,6 @@ FOREACH_BINARY_OP_SCALAR(add);
 FOREACH_BINARY_OP_SCALAR(sub);
 FOREACH_BINARY_OP_SCALAR(mul);
 FOREACH_BINARY_OP_SCALAR(div);
-FOREACH_BINARY_OP_SCALARLIST(add);
-FOREACH_BINARY_OP_SCALARLIST(sub);
-FOREACH_BINARY_OP_SCALARLIST(mul);
-FOREACH_BINARY_OP_SCALARLIST(div);
 FOREACH_BINARY_OP_LIST(mul);
 FOREACH_BINARY_OP_LIST(div);
 FOREACH_UNARY_OP(sqrt);
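As a reading aid (not part of the diff), a minimal Python sketch of what the removed slow-path ScalarList kernels computed for add, assuming one scalar per tensor:

    import torch

    def foreach_add_scalarlist_slow(tensors, scalars):
        # Out-of-place variant: one new tensor per (tensor, scalar) pair.
        assert len(tensors) > 0 and len(tensors) == len(scalars)
        return [t.add(s) for t, s in zip(tensors, scalars)]

    def foreach_add_scalarlist_slow_(tensors, scalars):
        # In-place variant: each tensor is updated with its matching scalar.
        assert len(tensors) > 0 and len(tensors) == len(scalars)
        for t, s in zip(tensors, scalars):
            t.add_(s)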

‎aten/src/ATen/native/ForeachUtils.h

-14
@@ -31,12 +31,6 @@ void check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2) {
   }
 }
 
-void check_foreach_api_restrictions(TensorList tensors, ArrayRef<double> scalars) {
-  TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor.");
-  TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one value.");
-  TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list.");
-}
-
 // To go via 'fast' path, several conditions must be satisfied
 // - All tensors must be on the same device
 // - All tensors must have strided layout
 
@@ -138,13 +132,5 @@ bool can_use_fast_route(TensorList tensors) {
   return true;
 }
 
-bool can_use_fast_route(TensorList tensors, ArrayRef<double> scalars) {
-  TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor.");
-  TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one value.");
-  TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list.");
-
-  return can_use_fast_route(tensors);
-}
-
 }
 }} // at::native

‎aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu

-60
This file was deleted.

‎aten/src/ATen/native/cuda/ForeachFunctors.cuh

-115
@@ -118,121 +118,6 @@ struct BinaryOpScalarFunctor {
     }
 };
 
-template<typename T, template<class> class Op>
-struct BinaryOpScalarListFunctor_ {
-    __device__ void operator() (
-        int chunk_size,
-        TensorListScalarListMetadata<1>& tl) {
-            int tensor_loc = tl.block_to_tensor[blockIdx.x];
-            int chunk_idx = tl.block_to_chunk[blockIdx.x];
-            int n = tl.sizes[tensor_loc];
-
-            T* x = (T*)tl.addresses[0][tensor_loc];
-            x += chunk_idx * chunk_size;
-
-            double y = tl.scalar_vals[tensor_loc];
-
-            n -= chunk_idx * chunk_size;
-
-            T r_x[kILP];
-
-            // to make things simple, we put aligned case in a different code path
-            if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x)) {
-                for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) {
-                    // load
-                    load_store(r_x, x, 0 , i_start);
-#pragma unroll
-                    for(int ii = 0; ii < kILP; ii++) {
-                        r_x[ii] = Op<T>()(static_cast<T>(r_x[ii]), y);
-                    }
-                    // store
-                    load_store(x, r_x, i_start, 0);
-                }
-            }
-            else {
-                for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) {
-#pragma unroll
-                    for(int ii = 0; ii < kILP; ii++) {
-                        r_x[ii] = 0;
-                        int i = i_start + threadIdx.x + ii * blockDim.x;
-                        if(i < n && i < chunk_size) {
-                            r_x[ii] = x[i];
-                        }
-                    }
-#pragma unroll
-                    for(int ii = 0; ii < kILP; ii++) {
-                        r_x[ii] = Op<T>()(static_cast<T>(r_x[ii]), y);
-                    }
-#pragma unroll
-                    for(int ii = 0; ii < kILP; ii++) {
-                        int i = i_start + threadIdx.x + ii * blockDim.x;
-                        if(i < n && i < chunk_size)
-                            x[i] = r_x[ii];
-                    }
-                }
-            }
-    }
-};
-
-template<typename T, template<class> class Op>
-struct BinaryOpScalarListFunctor {
-    __device__ void operator() (
-        int chunk_size,
-        TensorListScalarListMetadata<2>& tl) {
-            int tensor_loc = tl.block_to_tensor[blockIdx.x];
-            int chunk_idx = tl.block_to_chunk[blockIdx.x];
-            int n = tl.sizes[tensor_loc];
-
-            T* x = (T*)tl.addresses[0][tensor_loc];
-            x += chunk_idx * chunk_size;
-
-            T* out = (T*)tl.addresses[1][tensor_loc];
-            out += chunk_idx * chunk_size;
-
-            double y = tl.scalar_vals[tensor_loc];
-
-            n -= chunk_idx * chunk_size;
-
-            T r_x[kILP];
-
-            // to make things simple, we put aligned case in a different code path
-            if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x) && is_aligned(out)) {
-                for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) {
-                    // load
-                    load_store(r_x, x, 0 , i_start);
-#pragma unroll
-                    for(int ii = 0; ii < kILP; ii++) {
-                        r_x[ii] = Op<T>()(static_cast<T>(r_x[ii]), y);
-                    }
-                    // store
-                    load_store(out, r_x, i_start, 0);
-                }
-            }
-            else {
-                for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) {
-#pragma unroll
-                    for(int ii = 0; ii < kILP; ii++) {
-                        r_x[ii] = 0;
-                        int i = i_start + threadIdx.x + ii * blockDim.x;
-                        if(i < n && i < chunk_size) {
-                            r_x[ii] = x[i];
-                        }
-                    }
-#pragma unroll
-                    for(int ii = 0; ii < kILP; ii++) {
-                        r_x[ii] = Op<T>()(static_cast<T>(r_x[ii]), y);
-                    }
-#pragma unroll
-                    for(int ii = 0; ii < kILP; ii++) {
-                        int i = i_start + threadIdx.x + ii * blockDim.x;
-                        if(i < n && i < chunk_size)
-                            out[i] = r_x[ii];
-                    }
-                }
-            }
-    }
-};
-
 template<typename T, template<class> class Op>
 struct BinaryOpListAlphaFunctor_ {
     __device__ void operator() (

‎aten/src/ATen/native/cuda/MultiTensorApply.cuh

-70
@@ -26,7 +26,6 @@ __device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int s
 // TensorListMetadata has to be < 4KB - the limit for kernel launch argument
 static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
 static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
-static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30};
 
 template<int n> struct TensorListMetadata
 {
 
@@ -36,15 +35,6 @@ template<int n> struct TensorListMetadata
   int block_to_chunk[depth_to_max_blocks[n-1]];
 };
 
-template<int n> struct TensorListScalarListMetadata
-{
-  void* addresses[n][depth_to_max_tensors_scalarlist[n-1]];
-  int sizes[depth_to_max_tensors_scalarlist[n-1]];
-  double scalar_vals[depth_to_max_tensors_scalarlist[n-1]];
-  unsigned char block_to_tensor[depth_to_max_blocks[n-1]];
-  int block_to_chunk[depth_to_max_blocks[n-1]];
-};
-
 template<typename T, typename U, typename... ArgTypes>
 C10_LAUNCH_BOUNDS_1(kBlockSize)
 __global__ void
 
@@ -59,71 +49,11 @@ multi_tensor_apply_kernel(
 template<int depth, typename T, typename... ArgTypes>
 void multi_tensor_apply(
     std::vector<std::vector<at::Tensor>>& tensor_lists,
-    at::ArrayRef<double> scalars,
     T callable,
     ArgTypes... args) {
        TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth.");
        const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
-        size_t n_tensors = tensor_lists[0].size();
-        TensorListScalarListMetadata<depth> tensorListMeta;
-
-        int loc_block_info = 0;
-        int loc_tensor_info = 0;
-        for(size_t t = 0; t < n_tensors; t++) {
-
-            tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t];
-
-            tensorListMeta.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
-            for (int d = 0; d < depth; d++) {
-                tensorListMeta.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
-            }
-            loc_tensor_info++;
-
-            int chunks = (tensor_lists[0][t].numel() + kChunkSize - 1)/kChunkSize;
-            for (int chunk = 0; chunk < chunks; chunk++) {
-                tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
-                tensorListMeta.block_to_chunk[loc_block_info] = chunk;
-                loc_block_info++;
-
-                bool tensors_full = (loc_tensor_info == depth_to_max_tensors_scalarlist[depth-1] &&
-                    chunk == chunks - 1);
-                bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]);
-                bool last_chunk = (t == n_tensors - 1 && chunk == chunks - 1);
-
-                if (tensors_full || blocks_full || last_chunk) {
-                    multi_tensor_apply_kernel<<<loc_block_info, kBlockSize, 0, at::cuda::getCurrentCUDAStream()>>>(
-                        tensorListMeta,
-                        callable,
-                        args...);
-
-                    AT_CUDA_CHECK(cudaGetLastError());
-
-                    // Reset.
-                    loc_block_info = 0;
-                    if(chunk == chunks - 1) {
-                        loc_tensor_info = 0;
-                    }
-                    else {
-                        tensorListMeta.sizes[0] = tensorListMeta.sizes[loc_tensor_info-1];
-                        tensorListMeta.scalar_vals[0] = tensorListMeta.scalar_vals[loc_tensor_info-1];
-                        for(int d = 0; d < depth; d++) {
-                            tensorListMeta.addresses[d][0] = tensorListMeta.addresses[d][loc_tensor_info-1];
-                        }
-                        loc_tensor_info = 1;
-                    }
-                }
-            }
-        }
-    }
-
-template<int depth, typename T, typename... ArgTypes>
-void multi_tensor_apply(
-    std::vector<std::vector<at::Tensor>>& tensor_lists,
-    T callable,
-    ArgTypes... args) {
-        TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth.");
-        const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
        size_t n_tensors = tensor_lists[0].size();
        TensorListMetadata<depth> tensorListMeta;
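For context (not part of the diff), a simplified Python sketch of the chunking bookkeeping in the removed multi_tensor_apply overload: each tensor is split into fixed-size chunks, and a kernel launch is issued whenever the metadata struct runs out of tensor or block slots, or at the last chunk. MAX_TENSORS and MAX_BLOCKS are the depth-1 values from the removed constants; CHUNK_SIZE is a placeholder, since kChunkSize is defined elsewhere in the header.

    MAX_TENSORS = 96     # depth_to_max_tensors_scalarlist[0] in the removed code
    MAX_BLOCKS = 320     # depth_to_max_blocks[0]
    CHUNK_SIZE = 65536   # placeholder value standing in for kChunkSize

    def plan_launches(numels):
        """Return the number of blocks covered by each simulated kernel launch."""
        launches = []
        n_tensors = n_blocks = 0
        for t, numel in enumerate(numels):
            n_tensors += 1
            chunks = (numel + CHUNK_SIZE - 1) // CHUNK_SIZE
            for chunk in range(chunks):
                n_blocks += 1
                tensors_full = n_tensors == MAX_TENSORS and chunk == chunks - 1
                blocks_full = n_blocks == MAX_BLOCKS
                last_chunk = t == len(numels) - 1 and chunk == chunks - 1
                if tensors_full or blocks_full or last_chunk:
                    launches.append(n_blocks)  # one multi_tensor_apply_kernel launch
                    n_blocks = 0
                    # keep the partially processed tensor if its chunks are not done
                    n_tensors = 0 if chunk == chunks - 1 else 1
        return launches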

‎aten/src/ATen/native/native_functions.yaml

+5 -92

@@ -6187,247 +6187,160 @@
     CUDA: foreach_tensor_add_scalar_kernel_cuda
 
 - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_add_scalar_kernel_slow_
     CUDA: foreach_tensor_add_scalar_kernel_cuda_
 
 - func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[]
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_sub_scalar_kernel_slow
     CUDA: foreach_tensor_sub_scalar_kernel_cuda
 
 - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_sub_scalar_kernel_slow_
     CUDA: foreach_tensor_sub_scalar_kernel_cuda_
 
 - func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[]
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_mul_scalar_kernel_slow
     CUDA: foreach_tensor_mul_scalar_kernel_cuda
 
 - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_mul_scalar_kernel_slow_
     CUDA: foreach_tensor_mul_scalar_kernel_cuda_
 
 - func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[]
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_div_scalar_kernel_slow
     CUDA: foreach_tensor_div_scalar_kernel_cuda
 
 - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_div_scalar_kernel_slow_
     CUDA: foreach_tensor_div_scalar_kernel_cuda_
 
-- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[]
-  use_c10_dispatcher: full
+- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, Scalar alpha=1) -> Tensor[]
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_add_list_kernel_slow
     CUDA: foreach_tensor_add_list_kernel_cuda
 
-- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
-  use_c10_dispatcher: full
+- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, Scalar alpha=1) -> ()
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_add_list_kernel_slow_
     CUDA: foreach_tensor_add_list_kernel_cuda_
 
-- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[]
-  use_c10_dispatcher: full
+- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, Scalar alpha=1) -> Tensor[]
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_sub_list_kernel_slow
     CUDA: foreach_tensor_sub_list_kernel_cuda
 
-- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
-  use_c10_dispatcher: full
+- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, Scalar alpha=1) -> ()
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_sub_list_kernel_slow_
     CUDA: foreach_tensor_sub_list_kernel_cuda_
 
 - func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[]
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_mul_list_kernel_slow
     CUDA: foreach_tensor_mul_list_kernel_cuda
 
 - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_mul_list_kernel_slow_
     CUDA: foreach_tensor_mul_list_kernel_cuda_
 
-- func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[]
-  use_c10_dispatcher: full
+- func: _foreach_div.List(Tensor(a!)[] self, Tensor[] other) -> Tensor[]
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_div_list_kernel_slow
     CUDA: foreach_tensor_div_list_kernel_cuda
 
 - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_div_list_kernel_slow_
     CUDA: foreach_tensor_div_list_kernel_cuda_
 
-- func: _foreach_add.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[]
-  use_c10_dispatcher: full
-  device_guard: False
-  variants: function
-  dispatch:
-    CPU: foreach_tensor_add_scalarlist_kernel_slow
-    CUDA: foreach_tensor_add_scalarlist_kernel_cuda
-
-- func: _foreach_add_.ScalarList(Tensor(a!)[] self, float[] scalars) -> ()
-  use_c10_dispatcher: full
-  device_guard: False
-  variants: function
-  dispatch:
-    CPU: foreach_tensor_add_scalarlist_kernel_slow_
-    CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
-
-- func: _foreach_sub.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[]
-  use_c10_dispatcher: full
-  device_guard: False
-  variants: function
-  dispatch:
-    CPU: foreach_tensor_sub_scalarlist_kernel_slow
-    CUDA: foreach_tensor_sub_scalarlist_kernel_cuda
-
-- func: _foreach_sub_.ScalarList(Tensor(a!)[] self, float[] scalars) -> ()
-  use_c10_dispatcher: full
-  device_guard: False
-  variants: function
-  dispatch:
-    CPU: foreach_tensor_sub_scalarlist_kernel_slow_
-    CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
-
-- func: _foreach_div.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[]
-  use_c10_dispatcher: full
-  device_guard: False
-  variants: function
-  dispatch:
-    CPU: foreach_tensor_div_scalarlist_kernel_slow
-    CUDA: foreach_tensor_div_scalarlist_kernel_cuda
-
-- func: _foreach_div_.ScalarList(Tensor(a!)[] self, float[] scalars) -> ()
-  use_c10_dispatcher: full
-  device_guard: False
-  variants: function
-  dispatch:
-    CPU: foreach_tensor_div_scalarlist_kernel_slow_
-    CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
-
-- func: _foreach_mul.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[]
-  use_c10_dispatcher: full
-  device_guard: False
-  variants: function
-  dispatch:
-    CPU: foreach_tensor_mul_scalarlist_kernel_slow
-    CUDA: foreach_tensor_mul_scalarlist_kernel_cuda
-
-- func: _foreach_mul_.ScalarList(Tensor(a!)[] self, float[] scalars) -> ()
-  use_c10_dispatcher: full
-  device_guard: False
-  variants: function
-  dispatch:
-    CPU: foreach_tensor_mul_scalarlist_kernel_slow_
-    CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
-
 - func: _foreach_exp(Tensor[] tensors) -> Tensor[]
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_exp_slow
     CUDA: foreach_tensor_exp_cuda
 
 - func: _foreach_exp_(Tensor(a!)[] self) -> ()
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_exp_slow_
     CUDA: foreach_tensor_exp_cuda_
 
 - func: _foreach_sqrt(Tensor[] tensors) -> Tensor[]
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_sqrt_slow
     CUDA: foreach_tensor_sqrt_cuda
 
 - func: _foreach_sqrt_(Tensor(a!)[] self) -> ()
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_sqrt_slow_
     CUDA: foreach_tensor_sqrt_cuda_
 
 - func: _foreach_addcdiv_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_addcdiv_slow_
     CUDA: foreach_tensor_addcdiv_cuda_
 
 - func: _foreach_addcmul_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_addcmul_slow_
     CUDA: foreach_tensor_addcmul_cuda_
 
 - func: _foreach_addcdiv(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:
     CPU: foreach_tensor_addcdiv_slow
     CUDA: foreach_tensor_addcdiv_cuda
 
 - func: _foreach_addcmul(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
-  use_c10_dispatcher: full
   device_guard: False
   variants: function
   dispatch:

‎test/backward_compatibility/check_backward_compatibility.py

-4
@@ -99,10 +99,6 @@
     ("preprocess", datetime.date(2020, 10, 1)),
     ("compile", datetime.date(2020, 10, 1)),
     ("execute", datetime.date(2020, 10, 1)),
-    ("aten::_foreach_add", datetime.date(2020, 10, 1)),
-    ("aten::_foreach_sub_", datetime.date(2020, 10, 1)),
-    ("aten::_foreach_div", datetime.date(2020, 10, 1)),
-    ("aten::_foreach_sub", datetime.date(2020, 10, 1)),
 ]
 

‎test/test_foreach.py

+108 -421

Large diffs are not rendered by default.

‎test/test_native_functions.py

+1 -1

@@ -58,7 +58,7 @@ def fake_module(values, const):
         self.do_test_optional_floatlist_with_module(fake_module)
 
     def test_optional_floatlist_invalid(self):
-        with self.assertRaisesRegex(TypeError, "must be tuple of floats, not list"):
+        with self.assertRaisesRegex(TypeError, "must be .* but found"):
            FloatListWrapperModule()(torch.zeros(1), ["hi"])
 
        with self.assertRaisesRegex(RuntimeError, "value of type .* instead found type"):

‎tools/autograd/gen_python_functions.py

-1
@@ -281,7 +281,6 @@ def create_python_bindings(python_functions, is_python_method, module):
         'c10::optional<bool>': 'toBoolOptional',
         'c10::optional<double>': 'toDoubleOptional',
         'c10::optional<ArrayRef<double>>': 'doublelistOptional',
-        'ArrayRef<double>': 'doublelist',
         'IntArrayRef': 'intlist',
         'Scalar': 'scalar',
         'ScalarType': 'scalartype',

‎tools/autograd/templates/python_torch_functions.cpp

-1
@@ -44,7 +44,6 @@ using at::Generator;
 using at::TensorList;
 using at::Dimname;
 using at::DimnameList;
-using at::ArrayRef;
 
 using namespace torch::autograd::utils;

‎tools/codegen/model.py

-4
@@ -304,10 +304,6 @@ def __post_init__(self) -> None:
         # TODO: fixme
         if str(self.name) not in [
                 '_amp_non_finite_check_and_unscale_',
-                '_foreach_add_.ScalarList',
-                '_foreach_sub_.ScalarList',
-                '_foreach_mul_.ScalarList',
-                '_foreach_div_.ScalarList',
                 '_foreach_add_.Scalar',
                 '_foreach_sub_.Scalar',
                 '_foreach_mul_.Scalar',

‎tools/pyi/gen_pyi.py

-1
@@ -146,7 +146,6 @@ def type_to_python(typename, size=None):
         'Dimname': 'Union[str, ellipsis, None]',
         'DimnameList': 'Sequence[Union[str, ellipsis, None]]',
         'QScheme': '_qscheme',
-        'ArrayRef<double>' : 'Sequence[float]'
     }[typename]
 
     return typename

‎torch/csrc/utils/python_arg_parser.cpp

+1 -21

@@ -366,23 +366,6 @@ bool is_tensor_list_and_append_overloaded(PyObject* obj, std::vector<py::handle>
   return true;
 }
 
-bool is_float_list(PyObject* obj) {
-  auto tuple = six::isTuple(obj);
-  if (!(tuple || PyList_Check(obj))) {
-    return false;
-  }
-
-  auto size = tuple ? PyTuple_GET_SIZE(obj) : PyList_GET_SIZE(obj);
-  if (size > 0) {
-    PyObject* iobj = tuple ? PyTuple_GET_ITEM(obj, 0) : PyList_GET_ITEM(obj, 0);
-    if (!THPUtils_checkDouble(iobj) && !PyComplex_Check(iobj)) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
 // argnum is needed for raising the TypeError, it's used in the error message.
 auto FunctionParameter::check(PyObject* obj, std::vector<py::handle> &overloaded_args, int argnum) -> bool
 {
 
@@ -437,9 +420,7 @@ auto FunctionParameter::check(PyObject* obj, std::vector<py::handle> &overloaded
       // if a size is specified (e.g. IntArrayRef[2]) we also allow passing a single int
       return size > 0 && THPUtils_checkLong(obj);
     }
-    case ParameterType::FLOAT_LIST: {
-      return is_float_list(obj);
-    }
+    case ParameterType::FLOAT_LIST: return (PyTuple_Check(obj) || PyList_Check(obj));
     case ParameterType::GENERATOR: return THPGenerator_Check(obj);
     case ParameterType::BOOL: return PyBool_Check(obj);
     case ParameterType::STORAGE: return isStorage(obj);
 
@@ -920,7 +901,6 @@ PythonArgs PythonArgParser::raw_parse(PyObject* self, PyObject* args, PyObject*
     print_error(self, args, kwargs, parsed_args);
   }
 
-
 void PythonArgParser::print_error(PyObject* self, PyObject* args, PyObject* kwargs, PyObject* parsed_args[]) { // NOLINT
   auto num_args = PyTuple_GET_SIZE(args) + (kwargs ? PyDict_Size(kwargs) : 0);
   std::vector<int> plausible_idxs;

‎torch/csrc/utils/python_arg_parser.h

+4 -14

@@ -173,8 +173,6 @@ struct PythonArgs {
   inline c10::optional<bool> toBoolOptional(int i);
   inline c10::optional<double> toDoubleOptional(int i);
   inline c10::OptionalArray<double> doublelistOptional(int i);
-  inline std::vector<double> doublelist(int i);
-  inline std::vector<double> getDoublelist(int i);
   inline at::Layout layout(int i);
   inline at::Layout layoutWithDefault(int i, at::Layout default_layout);
   inline c10::optional<at::Layout> layoutOptional(int i);
 
@@ -371,7 +369,10 @@ inline c10::OptionalArray<int64_t> PythonArgs::intlistOptional(int i) {
   return intlist(i);
 }
 
-inline std::vector<double> PythonArgs::getDoublelist(int i) {
+inline c10::OptionalArray<double> PythonArgs::doublelistOptional(int i) {
+  if (!args[i]) {
+    return {};
+  }
   PyObject* arg = args[i];
   auto tuple = PyTuple_Check(arg);
   auto size = tuple ? PyTuple_GET_SIZE(arg) : PyList_GET_SIZE(arg);
 
@@ -389,17 +390,6 @@ inline std::vector<double> PythonArgs::getDoublelist(int i) {
   return res;
 }
 
-inline c10::OptionalArray<double> PythonArgs::doublelistOptional(int i) {
-  if (!args[i]) {
-    return {};
-  }
-  return this->getDoublelist(i);
-}
-
-inline std::vector<double> PythonArgs::doublelist(int i) {
-  return this->getDoublelist(i);
-}
-
 inline at::ScalarType PythonArgs::scalartypeWithDefault(int i, at::ScalarType default_scalartype) {
   if (!args[i]) return default_scalartype;
   return scalartype(i);
