@@ -176,7 +176,7 @@ VBuffer kernelNCHW_OCHW_repack_O4C4HWi4o4(
176
176
}
177
177
178
178
VBuffer bufferFromOptionalHostData (
179
- c10::optional<float *> data,
179
+ c10::optional<const float *> data,
180
180
const uint32_t size) {
181
181
const auto sizeAligned =
182
182
ROUND_UP (size, context ().limits ().minStorageBufferOffsetAlignment );
@@ -202,17 +202,15 @@ uint32_t conv2d_biasBufferSize(uint32_t oc) {
202
202
void conv2d_depthwise (
203
203
VulkanTensor& output,
204
204
const VulkanTensor& input,
205
- const float * weight,
206
- const c10::optional< float *> bias ,
207
- const Conv2DParams params,
205
+ const VulkanTensor& weight,
206
+ const VBuffer& biasBuffer ,
207
+ const Conv2DParams& params,
208
208
c10::optional<float > output_min,
209
209
c10::optional<float > output_max) {
210
210
TORCH_INTERNAL_ASSERT (params.G == params.C );
211
211
auto osizes = output.sizes ();
212
212
TORCH_INTERNAL_ASSERT (osizes[2 ] == params.OH );
213
213
TORCH_INTERNAL_ASSERT (osizes[3 ] == params.OW );
214
- auto biasBuffer =
215
- bufferFromOptionalHostData (bias, conv2d_biasBufferSize (params.OC ));
216
214
struct ConstBlock {
217
215
int32_t padding[2 ];
218
216
int32_t kernelSize[2 ];
@@ -234,9 +232,6 @@ void conv2d_depthwise(
234
232
output_max ? *output_max : std::numeric_limits<float >::infinity ()};
235
233
VBuffer constBuffer = makeUniformConstBuffer ((void *)&cb, sizeof (cb));
236
234
237
- VulkanTensor kernel{{params.OC , params.KH , params.KW }};
238
- kernel.set_data_from_host (weight);
239
-
240
235
VkDescriptorSetLayout descriptorSetLayout{};
241
236
VkDescriptorPool descriptorPool{};
242
237
VkDescriptorSet descriptorSet{};
@@ -256,7 +251,7 @@ void conv2d_depthwise(
256
251
257
252
output.image ()->bindStorageImage (descriptorSet, 0 );
258
253
input.image ()->bindShaderRead (descriptorSet, 1 );
259
- kernel .image ()->bindShaderRead (descriptorSet, 2 );
254
+ weight .image ()->bindShaderRead (descriptorSet, 2 );
260
255
biasBuffer.bind (descriptorSet, 3 );
261
256
constBuffer.bind (descriptorSet, 4 );
262
257
@@ -269,7 +264,7 @@ void conv2d_depthwise(
269
264
auto commandBuffer = computeUnit.commandBuffer ();
270
265
output.image ()->addImageMemoryBarrierToGeneral (commandBuffer);
271
266
input.image ()->addImageMemoryBarrierToShaderRead (commandBuffer);
272
- kernel .image ()->addImageMemoryBarrierToShaderRead (commandBuffer);
267
+ weight .image ()->addImageMemoryBarrierToShaderRead (commandBuffer);
273
268
computeUnit.dispatchCommandBuffer (
274
269
params.OW , params.OH , params.OC_4 , workGroupSize);
275
270
computeUnit.endCommandBuffer ();
@@ -279,6 +274,44 @@ void conv2d_depthwise(
279
274
vkDestroyDescriptorSetLayout (device, descriptorSetLayout, nullptr );
280
275
}
281
276
277
// Depthwise conv2d overload for callers that hold the weights as a
// VulkanTensor and the bias as an optional host pointer.
// Wraps `bias` in a VBuffer via bufferFromOptionalHostData(), sized with
// conv2d_biasBufferSize(params.OC), and forwards all other arguments
// unchanged to the conv2d_depthwise overload that takes a `const VBuffer&`.
// NOTE(review): how an empty `bias` is materialized is decided inside
// bufferFromOptionalHostData(), which is only partly visible here.
// NOTE(review): `params` is taken by value here while the VBuffer overload
// takes `const Conv2DParams&`; any change must be mirrored in the
// declaration, which is not visible in this view.
+ void conv2d_depthwise (
278
+ VulkanTensor& output,
279
+ const VulkanTensor& input,
280
+ const VulkanTensor& weight,
281
+ const c10::optional<const float *> bias,
282
+ const Conv2DParams params,
283
+ c10::optional<float > output_min,
284
+ c10::optional<float > output_max) {
285
+ conv2d_depthwise (
286
+ output,
287
+ input,
288
+ weight,
289
// The temporary VBuffer stays alive until the forwarded call returns.
+ bufferFromOptionalHostData (bias, conv2d_biasBufferSize (params.OC )),
290
+ params,
291
+ output_min,
292
+ output_max);
293
+ }
294
+
295
// Depthwise conv2d overload for callers that hold both the weights and the
// optional bias as host pointers.
// Creates a VulkanTensor with sizes {params.OC, params.KH, params.KW},
// fills it from `weight` with set_data_from_host(), wraps `bias` in a
// VBuffer via bufferFromOptionalHostData(), and forwards to the
// conv2d_depthwise overload that takes a VulkanTensor weight and a
// `const VBuffer&` bias.
// assumes `weight` points to at least OC * KH * KW floats — TODO confirm
// against set_data_from_host().
+ void conv2d_depthwise (
296
+ VulkanTensor& output,
297
+ const VulkanTensor& input,
298
+ const float * weight,
299
+ const c10::optional<const float *> bias,
300
+ const Conv2DParams params,
301
+ c10::optional<float > output_min,
302
+ c10::optional<float > output_max) {
303
// Temporary tensor holding the host weights for the forwarded call.
+ VulkanTensor weightTensor{{params.OC , params.KH , params.KW }};
304
+ weightTensor.set_data_from_host (weight);
305
+ conv2d_depthwise (
306
+ output,
307
+ input,
308
+ weightTensor,
309
// The temporary VBuffer stays alive until the forwarded call returns.
+ bufferFromOptionalHostData (bias, conv2d_biasBufferSize (params.OC )),
310
+ params,
311
+ output_min,
312
+ output_max);
313
+ }
314
+
282
315
ImageSizes conv2d_prepack_weights_image_sizes (
283
316
int64_t OC,
284
317
int64_t C,
@@ -463,7 +496,7 @@ void conv2d(
463
496
VulkanTensor& output,
464
497
const VulkanTensor& input,
465
498
const VImage& kernelImage,
466
- const c10::optional<float *> bias,
499
+ const c10::optional<const float *> bias,
467
500
const Conv2DParams& params,
468
501
c10::optional<float > output_min,
469
502
c10::optional<float > output_max) {
@@ -483,10 +516,22 @@ void conv2d(
483
516
VulkanTensor& output,
484
517
const VulkanTensor& input,
485
518
const VulkanTensor& weight_prepacked,
486
- c10::optional<float *> bias,
519
+ c10::optional<const float *> bias,
487
520
const Conv2DParams params,
488
521
c10::optional<float > output_min,
489
522
c10::optional<float > output_max) {
523
+ if (params.G > 1 ) {
524
+ conv2d_depthwise (
525
+ output,
526
+ input,
527
+ weight_prepacked,
528
+ bufferFromOptionalHostData (bias, conv2d_biasBufferSize (params.OC )),
529
+ params,
530
+ output_min,
531
+ output_max);
532
+ return ;
533
+ }
534
+
490
535
conv2d (
491
536
output,
492
537
input,
@@ -505,6 +550,18 @@ void conv2d(
505
550
const Conv2DParams params,
506
551
c10::optional<float > output_min,
507
552
c10::optional<float > output_max) {
553
+ if (params.G > 1 ) {
554
+ conv2d_depthwise (
555
+ output,
556
+ input,
557
+ weight_prepacked,
558
+ *(bias.buffer ()),
559
+ params,
560
+ output_min,
561
+ output_max);
562
+ return ;
563
+ }
564
+
508
565
conv2d (
509
566
output,
510
567
input,
@@ -519,7 +576,7 @@ void conv2d(
519
576
VulkanTensor& output,
520
577
const VulkanTensor& input,
521
578
const float * weight,
522
- const c10::optional<float *> bias,
579
+ const c10::optional<const float *> bias,
523
580
const Conv2DParams params,
524
581
c10::optional<float > output_min,
525
582
c10::optional<float > output_max) {
0 commit comments