#include "caffe/common_layers.hpp"
#include "caffe/filler.hpp"
#include "caffe/layer.hpp"
#include "caffe/util/gpu_util.cuh"
#include "caffe/util/math_functions.hpp"

namespace caffe {
@@ -22,18 +23,21 @@ __global__ void EmbedForward(const int nthreads, const Dtype* bottom_data,
22
23
}
23
24
}
24
25
26
// Forward declaration kept so the signature is visible above any use site.
template <typename Dtype>
__global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data,
    const Dtype* top_diff, const int M, const int N, const int K,
    Dtype* weight_diff);

// Accumulates the gradient w.r.t. the embedding weights.
//
// Launch: one thread per top element, i.e. nthreads == M * N
// (M = batch size, N = embedding dimension). K (the number of embedding
// rows) is unused in this kernel but kept for signature symmetry with the
// forward pass — do not remove it; callers pass it positionally.
//
// Each thread takes the gradient of a single output value top_diff[n, d]
// and adds it into the weight row selected by the n-th input index. The
// add must be atomic: several items in the batch may reference the same
// embedding row, so plain += would race. caffe_gpu_atomic_add (declared
// in caffe/util/gpu_util.cuh) performs the accumulation safely.
template <typename Dtype>
__global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data,
    const Dtype* top_diff, const int M, const int N, const int K,
    Dtype* weight_diff) {
  CUDA_KERNEL_LOOP(top_index, nthreads) {
    const int n = top_index / N;  // batch item, 0 <= n < M
    const int d = top_index % N;  // embedding coordinate, 0 <= d < N
    // Input indices are stored as Dtype; truncate back to the integer
    // row id they encode.
    const int index = static_cast<int>(bottom_data[n]);
    const int weight_index = index * N + d;
    caffe_gpu_atomic_add(top_diff[top_index], weight_diff + weight_index);
  }
}
39
43
@@ -59,13 +63,14 @@ void EmbedLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
59
63
const vector<bool >& propagate_down, const vector<Blob<Dtype>*>& bottom) {
60
64
CHECK (!propagate_down[0 ]) << " Can't backpropagate to EmbedLayer input." ;
61
65
if (this ->param_propagate_down_ [0 ]) {
66
+ const int top_count = top[0 ]->count ();
62
67
const int count = this ->blobs_ [0 ]->count ();
63
68
const Dtype* top_diff = top[0 ]->gpu_diff ();
64
69
const Dtype* bottom_data = bottom[0 ]->gpu_data ();
65
70
Dtype* weight_diff = this ->blobs_ [0 ]->mutable_gpu_diff ();
66
71
EmbedBackward<Dtype> // NOLINT_NEXT_LINE(whitespace/operators)
67
- <<<CAFFE_GET_BLOCKS(count ), CAFFE_CUDA_NUM_THREADS>>> (
68
- count , bottom_data, top_diff, M_, N_, K_, weight_diff);
72
+ <<<CAFFE_GET_BLOCKS(top_count ), CAFFE_CUDA_NUM_THREADS>>> (
73
+ top_count , bottom_data, top_diff, M_, N_, K_, weight_diff);
69
74
}
70
75
if (bias_term_ && this ->param_propagate_down_ [1 ]) {
71
76
const Dtype* top_diff = top[0 ]->gpu_diff ();
0 commit comments