@@ -246,33 +246,27 @@ std::tuple<Tensor, Tensor> prelu_backward_cuda(const Tensor& grad_out_, const Te
246
246
// -----------------------------------
247
247
void hardshrink_kernel (TensorIterator& iter, Scalar value) {
248
248
AT_DISPATCH_FLOATING_TYPES_AND2 (at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype (), " hardshrink_cuda" , [&]() {
249
- AT_SKIP_BFLOAT16_IF_NOT_ROCM (scalar_t , " hardshrink_cuda" , [&] {
250
- auto lambd = value.to <scalar_t >();
251
- gpu_kernel (iter, [lambd]GPU_LAMBDA (scalar_t a) -> scalar_t {
252
- return (a >= -lambd && a <= lambd) ? scalar_t (0 ) : a;
253
- });
249
+ auto lambd = value.to <scalar_t >();
250
+ gpu_kernel (iter, [lambd]GPU_LAMBDA (scalar_t a) -> scalar_t {
251
+ return (a >= -lambd && a <= lambd) ? scalar_t (0 ) : a;
254
252
});
255
253
});
256
254
}
257
255
258
256
// Softshrink: shrinks elements toward zero by lambd; elements inside
// [-lambd, lambd] become zero. Dispatches over floating types + Half/BFloat16.
void softshrink_kernel(TensorIterator& iter, Scalar value) {
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "softshrink_cuda", [&]() {
    auto lambd = value.to<scalar_t>();
    gpu_kernel(iter, [lambd]GPU_LAMBDA(scalar_t a) -> scalar_t {
      return a > lambd ? a - lambd : (a < -lambd ? a + lambd : scalar_t(0));
    });
  });
}
268
264
269
265
// Shared backward for hard/softshrink: gradient is zero where the input fell
// in the dead zone [-lambd, lambd], otherwise passes grad through unchanged.
void shrink_backward_kernel(TensorIterator& iter, Scalar value) {
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "shrink_backward_cuda", [&]() {
    auto lambd = value.to<scalar_t>();
    gpu_kernel(iter, [lambd]GPU_LAMBDA(scalar_t grad_val, scalar_t self_val) -> scalar_t {
      return (self_val >= -lambd && self_val <= lambd) ? scalar_t(0) : grad_val;
    });
  });
}
@@ -289,25 +283,21 @@ void hardtanh_backward_kernel(TensorIterator& iter, Scalar min, Scalar max) {
289
283
290
284
// Softplus: log1p(exp(beta * x)) / beta, with a linear passthrough once
// beta * x exceeds `threshold` (avoids overflow in exp for large inputs).
void softplus_kernel(TensorIterator& iter, Scalar beta_, Scalar threshold_) {
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "softplus_cuda", [&]() {
    auto beta = beta_.to<scalar_t>();
    auto threshold = threshold_.to<scalar_t>();
    gpu_kernel(iter, [beta, threshold]GPU_LAMBDA(scalar_t a) -> scalar_t {
      return (a * beta) > threshold ? a : static_cast<scalar_t>(::log1p(std::exp(a * beta))) / beta;
    });
  });
}
301
293
302
294
// Softplus backward: grad * (z - 1) / z with z = exp(beta * self), matching the
// forward's linear passthrough when beta * self exceeds `threshold`.
// Args: a = upstream grad, b = saved input.
void softplus_backward_kernel(TensorIterator& iter, Scalar beta_, Scalar threshold_) {
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "softplus_backward_cuda", [&]() {
    auto beta = beta_.to<scalar_t>();
    auto threshold = threshold_.to<scalar_t>();
    gpu_kernel(iter, [beta, threshold]GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
      scalar_t z = std::exp(b * beta);
      return (b * beta) > threshold ? a : a * (z - scalar_t(1.)) / z;
    });
  });
}
@@ -321,34 +311,28 @@ void threshold_kernel_impl(TensorIterator& iter, scalar_t threshold, scalar_t va
321
311
322
312
// Dtype-dispatching entry point for threshold: forwards the scalar threshold
// and replacement value to threshold_kernel_impl for every supported dtype
// (all types plus Half and BFloat16).
static void threshold_kernel(TensorIterator& iter, Scalar threshold, Scalar value) {
  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "threshold_cuda", [&] {
    threshold_kernel_impl<scalar_t>(iter, threshold.to<scalar_t>(), value.to<scalar_t>());
  });
}
329
317
330
318
// ELU forward: scale * x for x > 0, scale * alpha * (exp(input_scale * x) - 1)
// otherwise. Coefficients are folded into negcoef/poscoef/negiptcoef once per
// dispatch so the device lambda only multiplies.
void elu_kernel(TensorIterator& iter, Scalar alpha, Scalar scale, Scalar input_scale) {
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "elu_cuda", [&]() {
    auto negcoef = alpha.to<scalar_t>() * scale.to<scalar_t>();
    auto poscoef = scale.to<scalar_t>();
    auto negiptcoef = input_scale.to<scalar_t>();
    gpu_kernel(iter, [negcoef, poscoef, negiptcoef]GPU_LAMBDA(scalar_t a) -> scalar_t {
      return a > scalar_t(0) ? a * poscoef : (static_cast<scalar_t>(std::exp(a * negiptcoef)) - scalar_t(1.)) * negcoef;
    });
  });
}
342
328
343
329
// ELU backward. Args: a = upstream grad, b = saved forward OUTPUT (the
// b + negcoef form recovers scale*alpha*exp(...) from the output on the
// negative branch); positive branch is grad * scale.
void elu_backward_kernel(TensorIterator& iter, Scalar alpha, Scalar scale, Scalar input_scale) {
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "elu_backward_cuda", [&]() {
    auto negcoef = alpha.to<scalar_t>() * scale.to<scalar_t>();
    auto poscoef = scale.to<scalar_t>();
    auto negiptcoef = input_scale.to<scalar_t>();
    gpu_kernel(iter, [negcoef, poscoef, negiptcoef]GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
      return b <= scalar_t(0) ? a * negiptcoef * (b + negcoef) : a * poscoef;
    });
  });
}
@@ -387,22 +371,18 @@ void GeluBackwardCUDAKernelImpl(TensorIterator& it) {
387
371
388
372
// LeakyReLU forward: identity for positive inputs, slope `negval` otherwise.
void leaky_relu_kernel(TensorIterator& iter, Scalar negval_) {
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "leaky_relu_cuda", [&]() {
    auto negval = negval_.to<scalar_t>();
    gpu_kernel(iter, [negval]GPU_LAMBDA(scalar_t a) -> scalar_t {
      return a > scalar_t(0) ? a : a * negval;
    });
  });
}
398
380
399
381
// LeakyReLU backward. Args: a = saved input (sign selector), b = upstream
// grad; grad passes through on the positive side, scaled by `negval` on the
// negative side.
void leaky_relu_backward_kernel(TensorIterator& iter, Scalar negval_) {
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "leaky_relu_backward_cuda", [&]() {
    auto negval = negval_.to<scalar_t>();
    gpu_kernel(iter, [negval]GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
      return a > scalar_t(0) ? b : b * negval;
    });
  });
}
0 commit comments