
Commit d5748d9

Iurii Zdebskyi authored and facebook-github-bot committed on Sep 25, 2020
Enable binary ops with Scalar Lists for foreach APIs (pytorch#45298)
Summary: Pull Request resolved: pytorch#45298
Test Plan: Imported from OSS
Reviewed By: ngimel
Differential Revision: D23931986
Pulled By: izdeby
fbshipit-source-id: 281267cd6f90d57a169af89f9f10b0f4fcab47e3
1 parent f07ac6a commit d5748d9

14 files changed: +837 −115 lines
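The commit adds scalar-list variants of the foreach binary ops: instead of applying one scalar to every tensor in the list, each tensor is paired with its own scalar. A minimal sketch of the semantics, driving the CPU reference kernel generated by the macro below — the direct call and the NativeFunctions.h include are illustrative assumptions; real callers go through the dispatcher bindings registered elsewhere in this commit and not shown in this excerpt:

    #include <ATen/ATen.h>
    #include <ATen/NativeFunctions.h>  // assumed to carry the generated at::native declarations
    #include <vector>

    int main() {
      std::vector<at::Tensor> tensors = {at::ones({2, 2}), at::ones({3})};
      std::vector<double> scalars = {2.0, 3.0};  // one scalar per tensor, same list length

      // Generated by FOREACH_BINARY_OP_SCALARLIST(add) in ForeachOpsKernels.cpp:
      // result[i] = tensors[i] + scalars[i]
      auto result = at::native::foreach_tensor_add_scalarlist_kernel_slow(tensors, scalars);

      // In-place variant: tensors[i] += scalars[i]
      at::native::foreach_tensor_add_scalarlist_kernel_slow_(tensors, scalars);
      return 0;
    }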
 

‎aten/src/ATen/native/ForeachOpsKernels.cpp

+24 lines

@@ -24,6 +24,26 @@ std::vector<Tensor> foreach_tensor_##NAME##_scalar_kernel_slow(TensorList tensor
   return result; \
 }
 
+#define FOREACH_BINARY_OP_SCALARLIST(NAME) \
+void foreach_tensor_##NAME##_scalarlist_kernel_slow_(TensorList tensors, at::ArrayRef<double> scalars) { \
+  check_foreach_api_restrictions(tensors, scalars); \
+ \
+  for (int i = 0; i < tensors.size(); i++) { \
+    tensors[i].NAME##_(scalars[i]); \
+  } \
+} \
+ \
+std::vector<Tensor> foreach_tensor_##NAME##_scalarlist_kernel_slow(TensorList tensors, at::ArrayRef<double> scalars) { \
+  check_foreach_api_restrictions(tensors, scalars); \
+  std::vector<Tensor> result; \
+  result.reserve(tensors.size()); \
+  for (int i = 0; i < tensors.size(); i++) { \
+    result.emplace_back(tensors[i].NAME(scalars[i])); \
+  } \
+ \
+  return result; \
+}
+
 #define FOREACH_BINARY_OP_LIST(NAME) \
 std::vector<Tensor> foreach_tensor_##NAME##_list_kernel_slow(TensorList tensors1, TensorList tensors2) { \
   check_foreach_api_restrictions(tensors1, tensors2); \

@@ -117,6 +137,10 @@ FOREACH_BINARY_OP_SCALAR(add);
 FOREACH_BINARY_OP_SCALAR(sub);
 FOREACH_BINARY_OP_SCALAR(mul);
 FOREACH_BINARY_OP_SCALAR(div);
+FOREACH_BINARY_OP_SCALARLIST(add);
+FOREACH_BINARY_OP_SCALARLIST(sub);
+FOREACH_BINARY_OP_SCALARLIST(mul);
+FOREACH_BINARY_OP_SCALARLIST(div);
 FOREACH_BINARY_OP_LIST(mul);
 FOREACH_BINARY_OP_LIST(div);
 FOREACH_UNARY_OP(sqrt);
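For reference, expanding FOREACH_BINARY_OP_SCALARLIST(add) yields (up to whitespace) the following pair of reference kernels — a sketch of the preprocessor output, not code that appears in the diff:

    // In-place variant: mutate each tensor with its own scalar.
    void foreach_tensor_add_scalarlist_kernel_slow_(TensorList tensors, at::ArrayRef<double> scalars) {
      check_foreach_api_restrictions(tensors, scalars);

      for (int i = 0; i < tensors.size(); i++) {
        tensors[i].add_(scalars[i]);   // NAME##_ -> add_
      }
    }

    // Out-of-place variant: collect one result tensor per input.
    std::vector<Tensor> foreach_tensor_add_scalarlist_kernel_slow(TensorList tensors, at::ArrayRef<double> scalars) {
      check_foreach_api_restrictions(tensors, scalars);
      std::vector<Tensor> result;
      result.reserve(tensors.size());
      for (int i = 0; i < tensors.size(); i++) {
        result.emplace_back(tensors[i].add(scalars[i]));   // NAME -> add
      }

      return result;
    }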

‎aten/src/ATen/native/ForeachUtils.h

+14 lines

@@ -31,6 +31,12 @@ void check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2) {
   }
 }
 
+void check_foreach_api_restrictions(TensorList tensors, ArrayRef<double> scalars) {
+  TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor.");
+  TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one value.");
+  TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list.");
+}
+
 // To go via 'fast' path, several conditions must be satisfied
 // - All tensors must be on the same device
 // - All tensors must have strided layout

@@ -132,5 +138,13 @@ bool can_use_fast_route(TensorList tensors) {
   return true;
 }
 
+bool can_use_fast_route(TensorList tensors, ArrayRef<double> scalars) {
+  TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor.");
+  TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one value.");
+  TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list.");
+
+  return can_use_fast_route(tensors);
+}
+
 }
 }} // at::native
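Both new helpers enforce the same contract — non-empty lists of equal length — and the scalar-list overload of can_use_fast_route then defers to the existing per-tensor conditions (same device, strided layout, and the rest listed above). A small illustration of the checks; the direct calls and the include path are for demonstration only, assuming an in-tree build:

    #include <ATen/ATen.h>
    #include <ATen/native/ForeachUtils.h>  // the header changed above
    #include <vector>

    int main() {
      std::vector<at::Tensor> tensors = {at::ones({2}), at::ones({2})};
      std::vector<double> scalars = {1.0};  // wrong length: 2 tensors vs. 1 scalar

      // Would throw: "Tensor list must have same number of elements as scalar list."
      // at::native::check_foreach_api_restrictions(tensors, scalars);

      scalars.push_back(2.0);  // lengths now match
      at::native::check_foreach_api_restrictions(tensors, scalars);  // passes

      // Re-validates the lengths, then falls through to can_use_fast_route(tensors).
      bool fast = at::native::can_use_fast_route(tensors, scalars);
      (void)fast;
      return 0;
    }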
aten/src/ATen/native/cuda/ — new CUDA kernel file (+60 lines; filename not captured in this excerpt)

@@ -0,0 +1,60 @@
+#include <ATen/Dispatch.h>
+#include <ATen/native/ForeachUtils.h>
+#include <ATen/native/cuda/ForeachFunctors.cuh>
+
+namespace at { namespace native {
+
+template<template<class> class Op>
+std::vector<Tensor> foreach_binary_op(TensorList tensors, at::ArrayRef<double> scalars) {
+  std::vector<std::vector<at::Tensor>> tensor_lists;
+  std::vector<at::Tensor> vec_res;
+  for (const auto& t: tensors) {
+    vec_res.emplace_back(at::native::empty_like(t));
+  }
+
+  tensor_lists.emplace_back(tensors.vec());
+  tensor_lists.emplace_back(vec_res);
+
+  AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda", [&]() {
+    multi_tensor_apply<2>(tensor_lists, scalars, BinaryOpScalarListFunctor<scalar_t, Op>());
+  });
+  return tensor_lists[1];
+}
+
+template<template<class> class Op>
+void foreach_binary_op_(TensorList tensors, at::ArrayRef<double> scalars) {
+  std::vector<std::vector<at::Tensor>> tensor_lists;
+  tensor_lists.emplace_back(tensors.vec());
+
+  AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda_", [&]() {
+    multi_tensor_apply<1>(tensor_lists, scalars, BinaryOpScalarListFunctor_<scalar_t, Op>());
+  });
+}
+
+#define FOREACH_BINARY_OP_SCALARLIST(NAME, OP) \
+void foreach_tensor_##NAME##_scalarlist_kernel_cuda_(TensorList tensors, at::ArrayRef<double> scalars) { \
+  check_foreach_api_restrictions(tensors); \
+ \
+  if (!can_use_fast_route(tensors, scalars)) { \
+    return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow_(tensors, scalars); \
+  } \
+ \
+  foreach_binary_op_<OP>(tensors, scalars); \
+} \
+ \
+std::vector<Tensor> foreach_tensor_##NAME##_scalarlist_kernel_cuda(TensorList tensors, at::ArrayRef<double> scalars) { \
+  check_foreach_api_restrictions(tensors); \
+ \
+  if (!can_use_fast_route(tensors, scalars)) { \
+    return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow(tensors, scalars); \
+  } \
+ \
+  return foreach_binary_op<OP>(tensors, scalars); \
+}
+
+FOREACH_BINARY_OP_SCALARLIST(add, std::plus);
+FOREACH_BINARY_OP_SCALARLIST(sub, std::minus);
+FOREACH_BINARY_OP_SCALARLIST(mul, std::multiplies);
+FOREACH_BINARY_OP_SCALARLIST(div, std::divides);
+
+}} // namespace at::native
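Each generated CUDA entry point validates the tensor list, then routes: if can_use_fast_route(tensors, scalars) fails — note that the scalar-list length check lives inside that call, since check_foreach_api_restrictions is invoked here with tensors only — it falls back to the slow reference kernel; otherwise it launches the multi_tensor_apply fast path, with std::plus/std::minus/std::multiplies/std::divides supplying the element-scalar operation. For example, FOREACH_BINARY_OP_SCALARLIST(add, std::plus) expands to roughly:

    void foreach_tensor_add_scalarlist_kernel_cuda_(TensorList tensors, at::ArrayRef<double> scalars) {
      check_foreach_api_restrictions(tensors);

      if (!can_use_fast_route(tensors, scalars)) {
        // Fall back to the reference kernel from ForeachOpsKernels.cpp.
        return at::native::foreach_tensor_add_scalarlist_kernel_slow_(tensors, scalars);
      }

      foreach_binary_op_<std::plus>(tensors, scalars);   // in-place fast path (depth 1)
    }

    std::vector<Tensor> foreach_tensor_add_scalarlist_kernel_cuda(TensorList tensors, at::ArrayRef<double> scalars) {
      check_foreach_api_restrictions(tensors);

      if (!can_use_fast_route(tensors, scalars)) {
        return at::native::foreach_tensor_add_scalarlist_kernel_slow(tensors, scalars);
      }

      return foreach_binary_op<std::plus>(tensors, scalars);   // out-of-place fast path (depth 2)
    }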

‎aten/src/ATen/native/cuda/ForeachFunctors.cuh

+115 lines

@@ -118,6 +118,121 @@ struct BinaryOpScalarFunctor {
   }
 };
 
+template<typename T, template<class> class Op>
+struct BinaryOpScalarListFunctor_ {
+  __device__ void operator() (
+      int chunk_size,
+      TensorListScalarListMetadata<1>& tl) {
+    int tensor_loc = tl.block_to_tensor[blockIdx.x];
+    int chunk_idx = tl.block_to_chunk[blockIdx.x];
+    int n = tl.sizes[tensor_loc];
+
+    T* x = (T*)tl.addresses[0][tensor_loc];
+    x += chunk_idx * chunk_size;
+
+    double y = tl.scalar_vals[tensor_loc];
+
+    n -= chunk_idx * chunk_size;
+
+    T r_x[kILP];
+
+    // to make things simple, we put aligned case in a different code path
+    if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x)) {
+      for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) {
+        // load
+        load_store(r_x, x, 0 , i_start);
+#pragma unroll
+        for(int ii = 0; ii < kILP; ii++) {
+          r_x[ii] = Op<T>()(static_cast<T>(r_x[ii]), y);
+        }
+        // store
+        load_store(x, r_x, i_start, 0);
+      }
+    }
+    else {
+      for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) {
+#pragma unroll
+        for(int ii = 0; ii < kILP; ii++) {
+          r_x[ii] = 0;
+          int i = i_start + threadIdx.x + ii * blockDim.x;
+          if(i < n && i < chunk_size) {
+            r_x[ii] = x[i];
+          }
+        }
+#pragma unroll
+        for(int ii = 0; ii < kILP; ii++) {
+          r_x[ii] = Op<T>()(static_cast<T>(r_x[ii]), y);
+        }
+#pragma unroll
+        for(int ii = 0; ii < kILP; ii++) {
+          int i = i_start + threadIdx.x + ii * blockDim.x;
+          if(i < n && i < chunk_size)
+            x[i] = r_x[ii];
+        }
+      }
+    }
+  }
+};
+
+template<typename T, template<class> class Op>
+struct BinaryOpScalarListFunctor {
+  __device__ void operator() (
+      int chunk_size,
+      TensorListScalarListMetadata<2>& tl) {
+    int tensor_loc = tl.block_to_tensor[blockIdx.x];
+    int chunk_idx = tl.block_to_chunk[blockIdx.x];
+    int n = tl.sizes[tensor_loc];
+
+    T* x = (T*)tl.addresses[0][tensor_loc];
+    x += chunk_idx * chunk_size;
+
+    T* out = (T*)tl.addresses[1][tensor_loc];
+    out += chunk_idx * chunk_size;
+
+    double y = tl.scalar_vals[tensor_loc];
+
+    n -= chunk_idx * chunk_size;
+
+    T r_x[kILP];
+
+    // to make things simple, we put aligned case in a different code path
+    if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x) && is_aligned(out)) {
+      for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) {
+        // load
+        load_store(r_x, x, 0 , i_start);
+#pragma unroll
+        for(int ii = 0; ii < kILP; ii++) {
+          r_x[ii] = Op<T>()(static_cast<T>(r_x[ii]), y);
+        }
+        // store
+        load_store(out, r_x, i_start, 0);
+      }
+    }
+    else {
+      for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) {
+#pragma unroll
+        for(int ii = 0; ii < kILP; ii++) {
+          r_x[ii] = 0;
+          int i = i_start + threadIdx.x + ii * blockDim.x;
+          if(i < n && i < chunk_size) {
+            r_x[ii] = x[i];
+          }
+        }
+#pragma unroll
+        for(int ii = 0; ii < kILP; ii++) {
+          r_x[ii] = Op<T>()(static_cast<T>(r_x[ii]), y);
+        }
+#pragma unroll
+        for(int ii = 0; ii < kILP; ii++) {
+          int i = i_start + threadIdx.x + ii * blockDim.x;
+          if(i < n && i < chunk_size)
+            out[i] = r_x[ii];
+        }
+      }
+    }
+  }
+};
+
 template<typename T, template<class> class Op>
 struct BinaryOpListAlphaFunctor_ {
   __device__ void operator() (
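Both functors follow the same per-chunk pattern as the existing BinaryOpScalarFunctor pair: each thread handles kILP elements per iteration, with a vectorized load_store fast path when the chunk is kILP-aligned and a bounds-checked element-wise path otherwise; the only new ingredient is that the scalar operand comes from tl.scalar_vals[tensor_loc] rather than a single kernel argument. A stripped-down restatement of the bounds-checked path as a hypothetical free device function (not part of the diff), shown for orientation:

    // One chunk of up to chunk_size elements at x, of which n are valid; every
    // thread of the block processes kILP strided elements per outer iteration,
    // applying Op against the per-tensor scalar y and writing back in place.
    template <typename T, template <class> class Op, int kILP>
    __device__ void apply_scalar_to_chunk(T* x, double y, int n, int chunk_size) {
      T r_x[kILP];
      for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) {
    #pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          r_x[ii] = (i < n && i < chunk_size) ? x[i] : T(0);  // guarded load
        }
    #pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          r_x[ii] = Op<T>()(static_cast<T>(r_x[ii]), y);      // element (op) scalar
        }
    #pragma unroll
        for (int ii = 0; ii < kILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size)
            x[i] = r_x[ii];                                   // guarded store
        }
      }
    }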

‎aten/src/ATen/native/cuda/MultiTensorApply.cuh

+70 lines

@@ -26,6 +26,7 @@ __device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int s
 // TensorListMetadata has to be < 4KB - the limit for kernel launch argument
 static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
 static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
+static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30};
 
 template<int n> struct TensorListMetadata
 {

@@ -35,6 +36,15 @@ template<int n> struct TensorListMetadata
   int block_to_chunk[depth_to_max_blocks[n-1]];
 };
 
+template<int n> struct TensorListScalarListMetadata
+{
+  void* addresses[n][depth_to_max_tensors_scalarlist[n-1]];
+  int sizes[depth_to_max_tensors_scalarlist[n-1]];
+  double scalar_vals[depth_to_max_tensors_scalarlist[n-1]];
+  unsigned char block_to_tensor[depth_to_max_blocks[n-1]];
+  int block_to_chunk[depth_to_max_blocks[n-1]];
+};
+
 template<typename T, typename U, typename... ArgTypes>
 C10_LAUNCH_BOUNDS_1(kBlockSize)
 __global__ void

@@ -49,11 +59,71 @@ multi_tensor_apply_kernel(
 template<int depth, typename T, typename... ArgTypes>
 void multi_tensor_apply(
   std::vector<std::vector<at::Tensor>>& tensor_lists,
+  at::ArrayRef<double> scalars,
   T callable,
   ArgTypes... args) {
   TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth.");
   const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
+  size_t n_tensors = tensor_lists[0].size();
+  TensorListScalarListMetadata<depth> tensorListMeta;
+
+  int loc_block_info = 0;
+  int loc_tensor_info = 0;
+  for(size_t t = 0; t < n_tensors; t++) {
+
+    tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t];
+
+    tensorListMeta.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
+    for (int d = 0; d < depth; d++) {
+      tensorListMeta.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
+    }
+    loc_tensor_info++;
+
+    int chunks = (tensor_lists[0][t].numel() + kChunkSize - 1)/kChunkSize;
+    for (int chunk = 0; chunk < chunks; chunk++) {
+      tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
+      tensorListMeta.block_to_chunk[loc_block_info] = chunk;
+      loc_block_info++;
+
+      bool tensors_full = (loc_tensor_info == depth_to_max_tensors_scalarlist[depth-1] &&
+                           chunk == chunks - 1);
+      bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]);
+      bool last_chunk = (t == n_tensors - 1 && chunk == chunks - 1);
+
+      if (tensors_full || blocks_full || last_chunk) {
+        multi_tensor_apply_kernel<<<loc_block_info, kBlockSize, 0, at::cuda::getCurrentCUDAStream()>>>(
+          tensorListMeta,
+          callable,
+          args...);
+
+        AT_CUDA_CHECK(cudaGetLastError());
+
+        // Reset.
+        loc_block_info = 0;
+        if(chunk == chunks - 1) {
+          loc_tensor_info = 0;
+        }
+        else {
+          tensorListMeta.sizes[0] = tensorListMeta.sizes[loc_tensor_info-1];
+          tensorListMeta.scalar_vals[0] = tensorListMeta.scalar_vals[loc_tensor_info-1];
+          for(int d = 0; d < depth; d++) {
+            tensorListMeta.addresses[d][0] = tensorListMeta.addresses[d][loc_tensor_info-1];
+          }
+          loc_tensor_info = 1;
+        }
+      }
+    }
+  }
+}
+
 
+template<int depth, typename T, typename... ArgTypes>
+void multi_tensor_apply(
+  std::vector<std::vector<at::Tensor>>& tensor_lists,
+  T callable,
+  ArgTypes... args) {
+  TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth.");
+  const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
   size_t n_tensors = tensor_lists[0].size();
   TensorListMetadata<depth> tensorListMeta;
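TensorListScalarListMetadata carries one double per tensor on top of the pointer and size tables, which is why depth_to_max_tensors_scalarlist caps a depth-1 launch at 96 tensors instead of 110: the struct is passed by value as a kernel argument and has to stay under the roughly 4KB launch-parameter limit noted in the comment above, alongside chunk_size, the callable, and any extra arguments. A rough sizing sketch (8-byte pointers, ignoring alignment padding and those extra arguments), plus a hypothetical compile-time guard that is not part of the commit:

    // TensorListMetadata<1>:            110*8 (addresses) + 110*4 (sizes)
    //                                   + 320*1 + 320*4 (block maps)        ~= 2920 bytes
    // TensorListScalarListMetadata<1>:   96*8 + 96*4 + 96*8 (scalar_vals)
    //                                   + 320*1 + 320*4                     ~= 3520 bytes
    static_assert(sizeof(TensorListScalarListMetadata<1>) <= 4096,
                  "metadata must fit in the kernel launch argument budget");

When a flush happens mid-tensor (blocks_full before the tensor's last chunk), the loop copies the current tensor's sizes, scalar_vals, and addresses entries into slot 0 and restarts with loc_tensor_info = 1, so a large tensor can span several kernel launches without re-reading its metadata from the host lists.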
