vulkan: query register count and use it in a better split_k heuristic
Use VK_KHR_pipeline_executable_properties to query the register count, and
use that to better estimate how many workgroups can fit on the SMs.
Particularly with the recent tile size changes (#12258), the old heuristic is
out of date.
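
To make the arithmetic concrete, here is a minimal standalone sketch of the
register-based occupancy estimate (a hypothetical illustration, not code from
this commit; the SM count, block size, and register count are made-up values,
and 65536 is the per-SM register file size on recent NVIDIA GPUs):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    // A 256-thread workgroup whose shader uses 64 registers per thread:
    // 65536 / (256 * 64) = 4 workgroups fit per SM, registers permitting.
    uint32_t block_size     = 256;
    uint32_t register_count = 64;
    uint32_t occupancy_factor = 65536 / (block_size * register_count);

    // With 80 SMs and a matmul producing only 4x4 = 16 output tiles, aim to
    // launch up to 4x as many workgroups as fit (shorter shaders suffer less
    // from tail effects), then clamp split_k to at most 4.
    uint32_t shader_core_count = 80;
    uint32_t m_tiles = 4, n_tiles = 4;
    uint32_t split_k = occupancy_factor * 4 * shader_core_count / (m_tiles * n_tiles);
    split_k = std::min(split_k, 4u);

    printf("occupancy_factor=%u split_k=%u\n", occupancy_factor, split_k);
    return 0;
}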

This heuristic benefits both the coopmat1 and coopmat2 paths on NVIDIA. It
would be good if somebody could hook up the missing details for other hardware.

Calling getPipelineExecutableStatisticsKHR required initializing Vulkan-HPP
more fully. The necessary steps are documented in the Vulkan-HPP readme.
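
For reference, the dynamic-dispatcher setup boils down to three steps; a
condensed sketch following the Vulkan-HPP readme (not the literal code from
this commit; error handling omitted):

// 1. Opt into the dynamic dispatcher before including vulkan.hpp.
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
#include <vulkan/vulkan.hpp>

// 2. Instantiate the default dispatcher's storage in exactly one
//    translation unit.
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE

int main() {
    // 3a. Load global entry points so vk::createInstance can be called.
    VULKAN_HPP_DEFAULT_DISPATCHER.init();

    vk::Instance instance = vk::createInstance(vk::InstanceCreateInfo{});

    // 3b. Re-init with the instance to resolve instance- and device-level
    //     functions, including extension entry points such as
    //     getPipelineExecutableStatisticsKHR.
    VULKAN_HPP_DEFAULT_DISPATCHER.init(instance);
    return 0;
}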
jeffbolznv committed Mar 11, 2025
1 parent 9a6eaed commit f4372a6
76 changes: 69 additions & 7 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5,8 +5,14 @@
#include "ggml-cpu.h"
#endif

// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1

#include <vulkan/vulkan.hpp>

// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE

#include <algorithm>
#include <cmath>
#include <iomanip>
@@ -90,6 +96,9 @@ struct vk_pipeline_struct {
bool needed {};
// set to true when the shader has been compiled
bool compiled {};
// number of registers used, extracted from pipeline executable properties
uint32_t register_count {};
std::vector<uint32_t> specialization_constants;
};

typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
@@ -184,6 +193,8 @@ struct vk_device_struct {
uint32_t coopmat_k;
bool coopmat2;

bool pipeline_executable_properties_support {};

size_t idx;

bool mul_mat_l[GGML_TYPE_COUNT];
@@ -893,6 +904,20 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
}
pipeline->compiled = true;

if (device->pipeline_executable_properties_support) {
vk::PipelineExecutableInfoKHR executableInfo;
executableInfo.pipeline = pipeline->pipeline;
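// Per the VK_KHR_pipeline_executable_properties spec, statistics are only
// available for pipelines created with VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR.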

auto statistics = device->device.getPipelineExecutableStatisticsKHR(executableInfo);
for (auto & s : statistics) {
VK_LOG_DEBUG(pipeline->name << " " << s.name << ": " << s.value.u64);
// "Register Count" is reported by NVIDIA drivers.
if (strcmp(s.name, "Register Count") == 0) {
pipeline->register_count = (uint32_t)s.value.u64;
}
}
}

{
std::lock_guard<std::mutex> guard(device->mutex);
device->pipelines.insert({ pipeline->name, pipeline });
@@ -1581,6 +1606,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
pipeline->push_constant_size = push_constant_size;
pipeline->wg_denoms = wg_denoms;
pipeline->align = align;
pipeline->specialization_constants = specialization_constants;
}

if (!pipeline->needed || pipeline->compiled) {
@@ -2289,6 +2315,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
bool amd_shader_core_properties2 = false;
bool pipeline_robustness = false;
bool coopmat2_support = false;
bool pipeline_executable_properties_support = false;
device->coopmat_support = false;

// Check if maintenance4 is supported
@@ -2316,6 +2343,8 @@
} else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_COOPMAT2")) {
coopmat2_support = true;
} else if (strcmp("VK_KHR_pipeline_executable_properties", properties.extensionName) == 0) {
pipeline_executable_properties_support = true;
}
}

@@ -2500,8 +2529,18 @@
device_extensions.push_back("VK_KHR_maintenance4");
}

VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR pep_features {};
pep_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR;
if (pipeline_executable_properties_support) {
last_struct->pNext = (VkBaseOutStructure *)&pep_features;
last_struct = (VkBaseOutStructure *)&pep_features;
device_extensions.push_back("VK_KHR_pipeline_executable_properties");
}

vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);

device->pipeline_executable_properties_support = pipeline_executable_properties_support;

device->fp16 = device->fp16 && vk12_features.shaderFloat16;

device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
@@ -2876,6 +2915,9 @@
}
VK_LOG_DEBUG("ggml_vk_instance_init()");

// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
VULKAN_HPP_DEFAULT_DISPATCHER.init();

uint32_t api_version = vk::enumerateInstanceVersion();

if (api_version < VK_API_VERSION_1_2) {
@@ -2928,6 +2970,9 @@
vk_instance.instance = vk::createInstance(instance_create_info);
vk_instance_initialized = true;

// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers
VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance);

size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();

// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
@@ -3832,12 +3877,21 @@ static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int
VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");

uint32_t split_k = 1;
if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) {
// If k is 'large' and the SMs will fill less than halfway, use split_k.
if (ctx->device->shader_core_count != 0) {
uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) {
split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
uint32_t occupancy_factor = 1;
// Estimate how many workgroups can fit on an SM at a time.
// Other factors like shared memory could affect this, and aren't taken into account.
if (ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA && pipeline->register_count > 0) {
uint32_t block_size = pipeline->specialization_constants[0];
assert(block_size > 0);
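// 65536 = per-SM register file size (in 32-bit registers) on recent NVIDIA GPUs.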
occupancy_factor = 65536 / (block_size * pipeline->register_count);
}
// The extra factor of 4 is to try to run up to 4x as many workgroups as can fit,
// to prefer shorter shaders that will be less prone to tail effects
if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count * occupancy_factor * 4) {
split_k = occupancy_factor * 4 * ctx->device->shader_core_count / (m_tiles * n_tiles);
// Clamp to 2 or 4
split_k = std::min(split_k, 4u);
if (split_k == 3) {
@@ -4122,7 +4176,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
const int y_ne = padded_n * ne10;
const int d_ne = ne11 * ne01;

const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);
uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);

const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
@@ -4146,10 +4200,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT

const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
if (dryrun) {
const uint64_t x_sz_upd = x_sz * ne02 * ne03;
const uint64_t y_sz_upd = y_sz * ne12 * ne13;
const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
if (
(qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
(qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) ||
@@ -4174,11 +4228,19 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
if (qy_needs_dequant) {
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
}
if (split_k > 1) {
// ggml_vk_guess_split_k may make a different determination after the pipeline
// is compiled (based on register count), so prepare for split_k just in case.
if (split_k > 1 || !pipeline->compiled) {
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
}
return;
}
// ggml_vk_guess_split_k may make a different determination after the pipeline
// is compiled (based on register count). Fallback to no split_k if we didn't
// reserve enough memory.
if (split_k_size > ctx->prealloc_size_split_k) {
split_k = 1;
}

vk_buffer d_D = dst_buf_ctx->dev_buffer;
const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;