Commit 0aecbbb

Mike Ruberry authored and facebook-github-bot committed on Jun 10, 2020
Changes TensorIterator computation to not consider out kwarg, lets UnaryOps safe cast to out (pytorch#39655)
Summary:

**BC breaking note:** In PyTorch 1.5 passing the out= kwarg to some functions, like torch.add, could affect the computation. That is,

```
out = torch.add(a, b)
```

could produce a different tensor than

```
torch.add(a, b, out=out)
```

This is because previously the out argument participated in the type promotion rules. For greater consistency with NumPy, Python, and C++, in PyTorch 1.6 the out argument no longer participates in type promotion, and has no effect on the computation performed.

**ORIGINAL PR NOTE**

This PR effectively rewrites TensorIterator's "compute_types" function to both clarify its behavior and change how our type promotion works to never consider the out argument when determining the iterator's "common dtype," AKA its "computation type." That is,

```
a = op(b, c)
```

should always produce the same result as

```
op(b, c, out=a)
```

This is consistent with NumPy and programming languages like Python and C++.

The conceptual model for this change is that a TensorIterator may have a "common computation type" that all inputs are cast to and its computation performed in. This common computation type, if it exists, is determined by applying our type promotion rules to the inputs. A common computation type is natural for some classes of functions, like many binary elementwise functions (e.g. add, sub, mul, div...). (NumPy describes these as "universal functions.") Many functions, however, like indexing operations, don't have a natural common computation type. In the future we'll likely want to support setting the TensorIterator's common computation type explicitly to enable "floating ufuncs" like the sin function that promote integer types to the default scalar type. Logic like that is beyond the type promotion system, which can only review inputs.

Implementing this change in a readable and maintainable manner was challenging because compute_types() has had many small modifications from many authors over a ~2-year period, and the existing logic was in some places outdated and in other places unnecessarily complicated. The existing "strategies" approach also painted with a broad brush, and two of the strategies no longer made conceptual sense after this change. As a result, the new version of this function has a small set of flags to control its behavior. This has the positive effect of disentangling checks like all operands having the same device and their having the same dtype.

Additional changes in this PR:

- Unary operations now support out arguments with different dtypes. Like binary ops they check canCast(computation type, out dtype).
- The dtype checking for lerp was outdated and its error message included the wrong variable. It has been fixed.
- The check for whether all tensors are on the same device has been separated from other checks. TensorIterators used by copy disable this check.
- As a result of this change, the output dtype can be computed if only the input types are available.
- The "fast path" for checking if a common dtype computation is necessary has been updated and simplified to also handle zero-dim tensors.
- A couple helper functions for compute_types() have been inlined to improve readability.
- The confusingly named and no longer used promote_gpu_output_dtypes_ has been removed. This variable was intended to support casting fp16 reductions on GPU, but it had become a no-op. That logic is now implemented here: https://github.com/pytorch/pytorch/blob/856215509d89c935cd1768ce4b496d4fc0e919a6/aten/src/ATen/native/ReduceOpsUtils.h#L207.
Pull Request resolved: pytorch#39655
Differential Revision: D21970878
Pulled By: mruberry
fbshipit-source-id: 5e6354c78240877ab5d6b1f7cfb351bd89049012
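To make the BC-breaking note concrete, here is a minimal Python sketch of the 1.6 behavior described above (the tensors `a`, `b`, and `out` are hypothetical examples, not taken from the commit):

```python
import torch

a = torch.tensor([1, 2, 3], dtype=torch.int64)
b = torch.tensor([4, 5, 6], dtype=torch.int64)
out = torch.empty(3, dtype=torch.float32)

# The out tensor no longer participates in type promotion: the computation
# type is int64 (promotion over the inputs only), and the int64 result is
# then safe-cast into the float32 out tensor.
torch.add(a, b, out=out)   # same values as torch.add(a, b), stored as float32

# Unary ops follow the same rule and now accept an out with a different
# dtype whenever canCast(computation type, out dtype) holds.
torch.neg(a, out=out)

# Lossy casts are still rejected, e.g. a float computation cannot be
# written into an integer out tensor:
# torch.add(a.float(), b.float(), out=torch.empty(3, dtype=torch.int64))  # RuntimeError
```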
1 parent acc13ac commit 0aecbbb

29 files changed: +368 −282 lines
 

‎aten/src/ATen/native/BinaryOps.cpp

+18 −7

@@ -111,23 +111,34 @@ Tensor& remainder_(Tensor& self, const Tensor& other) {
 }
 
 Tensor& true_divide_out(Tensor& result, const Tensor& self, const Tensor& divisor) {
-  TORCH_CHECK(!isIntegralType(result.scalar_type(), /*includeBool=*/ true),
-              "True division requires a floating output type, but got ",
-              result.scalar_type());
+  // If both inputs have integral (or bool) types, creates
+  // temporary float copies as new inputs.
+  if (isIntegralType(self.scalar_type(), /*includeBool=*/ true)
+      && isIntegralType(divisor.scalar_type(), /*includeBool=*/ true)) {
+    const auto scalar_type = typeMetaToScalarType(c10::get_default_dtype());
+    auto iter = TensorIterator::binary_op(result,
+                                          self.to(scalar_type),
+                                          divisor.to(scalar_type),
+                                          /*check_mem_overlap=*/ true);
+    div_stub(iter.device_type(), iter);
+    return result;
+  }
   auto iter = TensorIterator::binary_op(result, self, divisor, /*check_mem_overlap=*/ true);
   div_stub(iter.device_type(), iter);
   return result;
 }
 
 Tensor true_divide(const Tensor& self, const Tensor& divisor) {
-  // If both inputs have integral (or bool) types, sets the output to have
-  // the default (floating) scalar type
+  // If both inputs have integral (or bool) types, creates
+  // temporary float copies as new inputs and sets the result's type to
+  // the default scalar type
   if (isIntegralType(self.scalar_type(), /*includeBool=*/ true)
       && isIntegralType(divisor.scalar_type(), /*includeBool=*/ true)) {
     const auto scalar_type = typeMetaToScalarType(c10::get_default_dtype());
     Tensor result = at::empty({0}, self.options().dtype(scalar_type));
-
-    auto iter = TensorIterator::binary_op(result, self, divisor);
+    auto iter = TensorIterator::binary_op(result,
+                                          self.to(scalar_type),
+                                          divisor.to(scalar_type));
     div_stub(iter.device_type(), iter);
     return result;
   }
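For reference, a small Python illustration of the true_divide behavior this hunk preserves: integral (and bool) inputs are computed as the default scalar type, so the division is not truncated (the tensors here are hypothetical):

```python
import torch

x = torch.tensor([1, 2, 3])   # int64
y = torch.tensor([2, 2, 2])   # int64

z = torch.true_divide(x, y)
print(z.dtype)  # torch.float32 -- the default scalar type
print(z)        # tensor([0.5000, 1.0000, 1.5000])
```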

‎aten/src/ATen/native/Copy.cpp

+2 −1

@@ -140,7 +140,8 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
   iter.add_output(self);
   iter.add_input(src);
   iter.dont_resize_outputs();
-  iter.dont_compute_common_dtype();
+  iter.check_all_same_dtype(false);
+  iter.check_all_same_device(false);
   iter.build();
 
   if (iter.numel() == 0) {
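The copy iterator intentionally mixes dtypes (and devices), which is why it opts out of both checks above. A rough Python illustration (hypothetical tensors):

```python
import torch

dst = torch.empty(3, dtype=torch.float32)
src = torch.tensor([1, 2, 3], dtype=torch.int64)

# copy_ converts between dtypes, so its TensorIterator must not require a
# common dtype across operands.
dst.copy_(src)
print(dst)  # tensor([1., 2., 3.])

# When a GPU is present, copy_ also crosses devices, hence the separate
# same-device check being disabled as well.
if torch.cuda.is_available():
    dst.copy_(src.cuda())
```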

‎aten/src/ATen/native/ReduceOpsUtils.h

+3 −3

@@ -212,7 +212,7 @@ static TensorIterator make_reduction(
 
 static TensorIterator make_reduction(
   const char* name, Tensor& result1, Tensor& result2, const Tensor& self, IntArrayRef dim,
-  bool keepdim, ScalarType dtype1, ScalarType dtype2, bool promote_gpu_output_dtypes=true)
+  bool keepdim, ScalarType dtype1, ScalarType dtype2)
 {
   // check that result type and dtype match if provided
   TORCH_CHECK(

@@ -240,9 +240,9 @@ static TensorIterator make_reduction(
   // product of templated kernel launches.
   if (self.scalar_type() == dtype1 ||
       (self.is_cuda() && self.scalar_type() == kHalf && dtype1 == kFloat)) {
-    return TensorIterator::reduce_op(viewed_result1, viewed_result2, self, promote_gpu_output_dtypes);
+    return TensorIterator::reduce_op(viewed_result1, viewed_result2, self);
   }
-  return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype1), promote_gpu_output_dtypes);
+  return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype1));
 }
 
 static TensorIterator make_reduction(

‎aten/src/ATen/native/TensorAdvancedIndexing.cpp

+12 −8

@@ -215,11 +215,15 @@ static AdvancedIndex make_info(Tensor self, TensorList orig) {
 static TensorIterator make_index_put_iterator(const AdvancedIndex& info, const Tensor& value) {
   TORCH_CHECK(is_expandable_to(value.sizes(), info.src.sizes()), "shape mismatch: value tensor of shape ", value.sizes(),
              " cannot be broadcast to indexing result of shape ", info.src.sizes());
+  TORCH_CHECK(value.scalar_type() == info.src.scalar_type(),
+              "Index put requires the source and destination dtypes match, "
+              "got ", info.src.scalar_type(), " for the destination "
+              "and ", value.scalar_type(), " for the source.");
   auto iter = TensorIterator();
-  iter.dont_compute_common_dtype();
   iter.dont_resize_outputs();
+  iter.check_all_same_dtype(false);
   iter.add_output(info.src);
-  iter.add_input(value, info.src.device(), info.src.scalar_type());
+  iter.add_input(value);
   for (auto& index : info.indices) {
     iter.add_input(index);
   }

@@ -229,7 +233,7 @@ static TensorIterator make_index_put_iterator(const AdvancedIndex& info, const T
 
 static TensorIterator make_index_iterator(const AdvancedIndex& info) {
   auto iter = TensorIterator();
-  iter.dont_compute_common_dtype();
+  iter.check_all_same_dtype(false);
   iter.add_output(Tensor(), info.src.device(), info.src.scalar_type());
   iter.add_input(info.src);
   for (auto& index : info.indices) {

@@ -241,7 +245,7 @@ static TensorIterator make_index_iterator(const AdvancedIndex& info) {
 
 static TensorIterator make_index_out_iterator(const AdvancedIndex& info, Tensor& result) {
   auto iter = TensorIterator();
-  iter.dont_compute_common_dtype();
+  iter.check_all_same_dtype(false);
   iter.add_output(result, info.src.device(), info.src.scalar_type());
   iter.add_input(info.src);
   for (auto& index : info.indices) {

@@ -437,7 +441,7 @@ Tensor & index_select_out_cpu_(Tensor & result, const Tensor & self, int64_t dim
   auto slice_size = selfSlice.numel();
 
   auto iter = TensorIterator();
-  iter.dont_compute_common_dtype();
+  iter.check_all_same_dtype(false);
   iter.dont_resize_outputs();
   iter.add_output(resultSlice);
   iter.add_input(selfSlice);

@@ -571,7 +575,7 @@ static Tensor & masked_fill_impl_cpu(Tensor & self, const Tensor & mask, Scalar
 }
 
   auto iter = TensorIterator();
-  iter.dont_compute_common_dtype();
+  iter.check_all_same_dtype(false);
   iter.dont_resize_outputs();
   iter.add_output(self);
   iter.add_input(mask);

@@ -658,7 +662,7 @@ static Tensor & masked_select_out_impl_cpu(Tensor & result, const Tensor & self,
   bool use_serial_kernel = self.numel() < at::internal::GRAIN_SIZE || at::get_num_threads() == 1;
   if (use_serial_kernel) {
     auto iter = TensorIterator();
-    iter.dont_compute_common_dtype();
+    iter.check_all_same_dtype(false);
     iter.dont_resize_outputs();
     iter.add_output(result_strided);
     iter.add_input(_self);

@@ -681,7 +685,7 @@ static Tensor & masked_select_out_impl_cpu(Tensor & result, const Tensor & self,
   std::partial_sum(mask_long_data, mask_long_data + mask_long.numel(), mask_prefix_sum_data);
 
   auto iter = TensorIterator();
-  iter.dont_compute_common_dtype();
+  iter.check_all_same_dtype(false);
   iter.dont_resize_outputs();
   iter.add_output(result_strided);
   iter.add_input(_self);
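The new TORCH_CHECK in make_index_put_iterator shows up in Python as an explicit dtype-mismatch error for index_put-style assignment. A small sketch (hypothetical tensors):

```python
import torch

dst = torch.zeros(5, dtype=torch.float32)
idx = torch.tensor([0, 2])

# Matching dtypes are fine.
dst[idx] = torch.tensor([1.0, 2.0])

# A mismatched source dtype now fails the check added above, e.g.
# "Index put requires the source and destination dtypes match, ..."
try:
    dst[idx] = torch.tensor([1, 2], dtype=torch.int64)
except RuntimeError as err:
    print(err)
```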

‎aten/src/ATen/native/TensorCompare.cpp

+1 −1

@@ -151,12 +151,12 @@ Tensor _s_where(const Tensor& condition, const Tensor& self, const Tensor& other
   TORCH_CHECK(self.dtype() == other.dtype(), "expected scalar type ", self.dtype(), " but found ", other.dtype());
   Tensor ret = at::empty(self.sizes(), self.options());
   auto iter = at::TensorIterator();
+  iter.check_all_same_dtype(false);
   iter.set_check_mem_overlap(true);
   iter.add_output(ret);
   iter.add_input(condition);
   iter.add_input(self);
   iter.add_input(other);
-  iter.dont_compute_common_dtype();
   iter.build();
   where_kernel(iter.device_type(), iter, condition.scalar_type());
   return ret;
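As the unchanged TORCH_CHECK above shows, _s_where still requires self and other to share a dtype as of this commit, even though its iterator no longer computes a common dtype. Roughly, in Python (hypothetical tensors):

```python
import torch

cond = torch.tensor([True, False, True])
x = torch.tensor([1.0, 2.0, 3.0])
y = torch.tensor([10.0, 20.0, 30.0])

print(torch.where(cond, x, y))  # tensor([ 1., 20.,  3.])

# Mismatched dtypes for the two value tensors are rejected by the check
# above ("expected scalar type Float but found Double").
try:
    torch.where(cond, x, y.double())
except RuntimeError as err:
    print(err)
```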
