diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index f2ab4c5d69582..7362a992e134e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3110,17 +3110,17 @@ static void ggml_compute_forward_dup_same_cont( const int ith = params->ith; // thread index const int nth = params->nth; // number of threads - // parallelize by elements - const int ne = ggml_nelements(dst); - const int dr = (ne + nth - 1) / nth; - const int ie0 = dr * ith; - const int ie1 = MIN(ie0 + dr, ne); + // parallelize by blocks + const int nk = ggml_nelements(src0)/ggml_blck_size(src0->type); + const int dr = (nk + nth - 1) / nth; + const int k0 = dr * ith; + const int k1 = MIN(k0 + dr, nk); - if (ie0 < ie1) { + if (k0 < k1) { memcpy( - ((char *) dst->data + ie0*nb0), - ((char *) src0->data + ie0*nb0), - (ie1 - ie0) * nb0); + ((char *) dst->data + k0*nb0), + ((char *) src0->data + k0*nb0), + (k1 - k0) * nb0); } } @@ -4055,7 +4055,6 @@ static void ggml_compute_forward_dup_f32( static void ggml_compute_forward_dup_bytes( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); @@ -4069,10 +4068,10 @@ static void ggml_compute_forward_dup_bytes( } const size_t type_size = ggml_type_size(src0->type); + const int ith = params->ith; // thread index const int nth = params->nth; // number of threads - // parallelize by rows const int nr = ne01; // number of rows per thread @@ -4082,10 +4081,10 @@ static void ggml_compute_forward_dup_bytes( const int ir1 = MIN(ir0 + dr, nr); if (src0->type == dst->type && - ne00 == ne0 && + ggml_are_same_shape(src0, dst) && nb00 == type_size && nb0 == type_size) { // copy by rows - const size_t rs = ne00 * type_size; + const size_t rs = ggml_row_size(src0->type, ne00); for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = ir0; i01 < ir1; i01++) { @@ -4140,17 +4139,20 @@ static void ggml_compute_forward_dup_bytes( } // dst counters - - int64_t i10 = 0; + int64_t k10 = 0; int64_t i11 = 0; int64_t i12 = 0; int64_t i13 = 0; + // number of blocks in a row + const int64_t nk00 = ne00 / ggml_blck_size(src0->type); + const int64_t nk0 = ne0 / ggml_blck_size(dst->type); + for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; + k10 += nk00 * ir0; + while (k10 >= nk0) { + k10 -= nk0; if (++i11 == ne1) { i11 = 0; if (++i12 == ne2) { @@ -4162,14 +4164,14 @@ static void ggml_compute_forward_dup_bytes( } } for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + for (int64_t k00 = 0; k00 < nk00; k00++) { + const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); memcpy(dst_ptr, src0_ptr, type_size); - if (++i10 == ne0) { - i10 = 0; + if (++k10 == nk0) { + k10 = 0; if (++i11 == ne1) { i11 = 0; if (++i12 == ne2) { @@ -4182,9 +4184,9 @@ static void ggml_compute_forward_dup_bytes( } } } - i10 += ne00 * (ne01 - ir1); - while (i10 >= ne0) { - i10 -= ne0; + k10 += nk00 * (ne01 - ir1); + while (k10 >= nk0) { + k10 -= nk0; if (++i11 == ne1) { i11 = 0; if (++i12 == ne2) { @@ -14067,7 +14069,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } // extra_buffer op? - if (ggml_cpu_extra_compute_forward(params, tensor)) return; + if (ggml_cpu_extra_compute_forward(params, tensor)) { + return; + } switch (tensor->op) { case GGML_OP_DUP: diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index b4e3631ed081a..3119e93a5880f 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1459,11 +1459,13 @@ struct test_cpy : public test_case { const ggml_type type_src; const ggml_type type_dst; const std::array ne; - const std::array permute; + const std::array permute_src; + const std::array permute_dst; bool _src_use_permute; + bool _dst_use_permute; std::string vars() override { - return VARS_TO_STR4(type_src, type_dst, ne, permute); + return VARS_TO_STR5(type_src, type_dst, ne, permute_src, permute_dst); } double max_nmse_err() override { @@ -1476,9 +1478,11 @@ struct test_cpy : public test_case { test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, std::array ne = {10, 10, 10, 1}, - std::array permute = {0, 0, 0, 0}) - : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute), - _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {} + std::array permute_src = {0, 0, 0, 0}, + std::array permute_dst = {0, 0, 0, 0}) + : type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst), + _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0), + _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); @@ -1486,13 +1490,18 @@ struct test_cpy : public test_case { ggml_set_name(src, "src"); if (_src_use_permute) { - src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]); + src = ggml_permute(ctx, src, permute_src[0], permute_src[1], permute_src[2], permute_src[3]); ggml_set_name(src, "src_permuted"); } - ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne); + ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, src->ne); ggml_set_name(dst, "dst"); + if (_dst_use_permute) { + dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]); + ggml_set_name(dst, "dst_permuted"); + } + ggml_tensor * out = ggml_cpy(ctx, src, dst); ggml_set_name(out, "out"); @@ -3929,14 +3938,25 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim)); } - for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { + // same-type copy + for (ggml_type type : all_types) { + const auto nk = ggml_blck_size(type); + + for (int k = 1; k < 4; ++k) { + test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4})); + test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 3, 1, 2}, {0, 2, 1, 3})); + } + } + + for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) { for (ggml_type type_dst : all_types) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows } } - for (ggml_type type_dst : {GGML_TYPE_F32}) { - for (ggml_type type_src : all_types) { + for (ggml_type type_src : all_types) { + for (ggml_type type_dst : {GGML_TYPE_F32}) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows }