#include "ggml-vulkan.h" #include #if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS) #include #include "ggml-cpu.h" #endif // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers- #define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 // We use VULKAN_HPP_DEFAULT_DISPATCHER, but not VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE // to avoid conflicts with applications or other libraries who might use it. #if VK_HEADER_VERSION >= 301 namespace vk::detail { class DispatchLoaderDynamic; } using vk::detail::DispatchLoaderDynamic; #else namespace vk { class DispatchLoaderDynamic; } using vk::DispatchLoaderDynamic; #endif DispatchLoaderDynamic & ggml_vk_default_dispatcher(); #define VULKAN_HPP_DEFAULT_DISPATCHER ggml_vk_default_dispatcher() #include // SPIR-V Headers: different SDK installations expose different include paths. // LunarG Vulkan SDK on Windows typically provides . // Linux packages, MSYS2 and MinGW often use the Khronos layout . #if __has_include() # include #elif __has_include() # include #elif __has_include() # include #else // Fallback to let the compiler throw a standard "file not found" error # include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(_MSC_VER) # define NOMINMAX 1 # include # define YIELD() YieldProcessor() #elif defined(__clang__) || defined(__GNUC__) # if defined(__x86_64__) ||defined(__i386__) # include # define YIELD() _mm_pause() # elif defined(__arm__) || defined(__aarch64__) # if defined(__clang__) # include # define YIELD() __yield() # else # define YIELD() asm volatile("yield") # endif # endif #endif #if !defined(YIELD) #define YIELD() #endif #include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-vulkan-shaders.hpp" // remove this once it's more widely available in the SDK #if !defined(VK_KHR_shader_bfloat16) #define VK_KHR_shader_bfloat16 1 #define VK_KHR_SHADER_BFLOAT16_SPEC_VERSION 1 #define VK_KHR_SHADER_BFLOAT16_EXTENSION_NAME "VK_KHR_shader_bfloat16" #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR ((VkStructureType)1000141000) #define VK_COMPONENT_TYPE_BFLOAT16_KHR ((VkComponentTypeKHR)1000141000) typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR { VkStructureType sType; void* pNext; VkBool32 shaderBFloat16Type; VkBool32 shaderBFloat16DotProduct; VkBool32 shaderBFloat16CooperativeMatrix; } VkPhysicalDeviceShaderBfloat16FeaturesKHR; #endif #define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1)) #define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; } #define VK_VENDOR_ID_AMD 0x1002 #define VK_VENDOR_ID_APPLE 0x106b #define VK_VENDOR_ID_INTEL 0x8086 #define VK_VENDOR_ID_NVIDIA 0x10de #define VK_VENDOR_ID_QUALCOMM 0x5143 #define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256 #define GGML_VK_MAX_NODES 8192 #define VK_CHECK(err, msg) \ do { \ vk::Result err_ = (err); \ if (err_ != vk::Result::eSuccess) { \ fprintf(stderr, "ggml_vulkan: %s error %s at %s:%d\n", \ #err, to_string(err_).c_str(), __FILE__, __LINE__); \ exit(1); \ } \ } while (0) #ifdef GGML_VULKAN_DEBUG #define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl #else #define VK_LOG_DEBUG(msg) ((void) 0) #endif // GGML_VULKAN_DEBUG struct ggml_backend_vk_context; #define MAX_PARAMETER_COUNT 12 // Max number of adds that can be fused without exceeding MAX_PARAMETER_COUNT. #define MAX_FUSED_ADDS (MAX_PARAMETER_COUNT - 3) typedef std::shared_ptr vk_pipeline; struct vk_pipeline_struct { std::string name; vk::ShaderModule shader_module; vk::PipelineLayout layout; vk::Pipeline pipeline; uint32_t push_constant_size; uint32_t parameter_count; std::array wg_denoms; uint32_t align; // true if fields have been set by ggml_vk_create_pipeline bool initialized {}; // set to true to request the pipeline is compiled std::atomic needed {}; // set to true when the shader has been compiled std::atomic compiled {}; // number of registers used, extracted from pipeline executable properties uint32_t register_count {}; #if defined(VK_EXT_shader_64bit_indexing) bool is_64b_indexing {}; #endif // linked list of pipelines for multiple compilation variants. // currently only used to compile a 64-bit indexing variant. vk_pipeline next; }; typedef std::weak_ptr vk_pipeline_ref; static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline); struct vk_matmul_pipeline_struct { vk_pipeline l, m, s; vk_pipeline a_l, a_m, a_s; // Returns true when all unaligned pipelines are null. // We only check for unaligned variants since one of the unaligned pipelines must exist // while aligned pipelines are optional bool is_empty() const { return l == nullptr && m == nullptr && s == nullptr; } }; typedef std::shared_ptr vk_matmul_pipeline; struct vk_matmul_pipeline2 { vk_matmul_pipeline2() { f16acc = std::make_shared(); f32acc = std::make_shared(); } vk_matmul_pipeline f32acc; vk_matmul_pipeline f16acc; }; struct vk_device_struct; typedef std::shared_ptr vk_device; typedef std::weak_ptr vk_device_ref; struct vk_buffer_struct; typedef std::shared_ptr vk_buffer; typedef std::weak_ptr vk_buffer_ref; struct ggml_backend_vk_buffer_type_context { std::string name; vk_device device; }; struct vk_queue; struct vk_command_buffer { vk::CommandBuffer buf; uint64_t use_counter = 0; bool in_use = false; }; // Stores command pool/buffers. There's an instance of this // for each (context,queue) pair and for each (device,queue) pair. struct vk_command_pool { void init(vk_device& device, vk_queue *q_); void destroy(vk::Device& device); vk::CommandPool pool; // Using deque so the pointers to command buffers // remain valid even if we add more std::deque cmd_buffers; vk_queue *q; size_t buffers_in_use() const { return std::count_if(cmd_buffers.begin(), cmd_buffers.end(), [](const auto& cb) { return cb.in_use; }); } }; // Prevent simultaneous submissions to the same queue. // This could be per vk_queue if we stopped having two vk_queue structures // sharing the same vk::Queue. static std::mutex queue_mutex; struct vk_queue { uint32_t queue_family_index; vk::Queue queue; vk_command_pool cmd_pool; vk::PipelineStageFlags stage_flags; bool transfer_only; // copy everything except the cmd_pool void copyFrom(vk_queue &other) { queue_family_index = other.queue_family_index; queue = other.queue; stage_flags = other.stage_flags; transfer_only = other.transfer_only; } }; static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft); static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft); static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft); static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor); static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { /* .get_name = */ ggml_backend_vk_buffer_type_name, /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size, /* .is_host = */ NULL, }; class vk_memory_logger; class vk_perf_logger; static void ggml_vk_destroy_buffer(vk_buffer& buf); static void ggml_vk_synchronize(ggml_backend_vk_context * ctx); static constexpr uint32_t mul_mat_vec_max_cols = 8; static constexpr uint32_t p021_max_gqa_ratio = 8; enum vk_device_architecture { OTHER, AMD_GCN, AMD_RDNA1, AMD_RDNA2, AMD_RDNA3, INTEL_XE2, NVIDIA_PRE_TURING, NVIDIA_TURING, }; static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) { vk::PhysicalDeviceProperties props = device.getProperties(); if (props.vendorID == VK_VENDOR_ID_AMD) { const std::vector ext_props = device.enumerateDeviceExtensionProperties(); bool amd_shader_core_properties = false; bool integer_dot_product = false; bool subgroup_size_control = false; for (const auto& properties : ext_props) { if (strcmp("VK_AMD_shader_core_properties", properties.extensionName) == 0) { amd_shader_core_properties = true; } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) { integer_dot_product = true; } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { subgroup_size_control = true; } } if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control) { return vk_device_architecture::OTHER; } vk::PhysicalDeviceProperties2 props2; vk::PhysicalDeviceShaderCorePropertiesAMD shader_core_props_amd; vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props; vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; props2.pNext = &shader_core_props_amd; shader_core_props_amd.pNext = &integer_dot_props; integer_dot_props.pNext = &subgroup_size_control_props; device.getProperties2(&props2); if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 64) { return vk_device_architecture::AMD_GCN; } if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 32) { // RDNA if (shader_core_props_amd.wavefrontsPerSimd == 20) { return vk_device_architecture::AMD_RDNA1; } if (integer_dot_props.integerDotProduct4x8BitPackedMixedSignednessAccelerated) { return vk_device_architecture::AMD_RDNA3; } return vk_device_architecture::AMD_RDNA2; } } else if (props.vendorID == VK_VENDOR_ID_INTEL) { const std::vector ext_props = device.enumerateDeviceExtensionProperties(); bool subgroup_size_control = false; for (const auto& properties : ext_props) { if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { subgroup_size_control = true; } } if (!subgroup_size_control) { return vk_device_architecture::OTHER; } vk::PhysicalDeviceProperties2 props2; vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; props2.pNext = &subgroup_size_control_props; device.getProperties2(&props2); if (subgroup_size_control_props.minSubgroupSize == 16) { // Xe2 architecture uses SIMD16 while previous Xe and Gen architecture uses SIMD8. // Minimum subgroup size matches the SIMD width so we distinguish architecture by checking this value. // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html return vk_device_architecture::INTEL_XE2; } } else if (props.vendorID == VK_VENDOR_ID_NVIDIA) { const std::vector ext_props = device.enumerateDeviceExtensionProperties(); bool cooperative_matrix = false; bool sm_builtins = false; // Detect "pre-turing" based on lack of coopmat support. for (const auto& properties : ext_props) { if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) { cooperative_matrix = true; } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) { sm_builtins = true; } } if (!cooperative_matrix) { return vk_device_architecture::NVIDIA_PRE_TURING; } if (sm_builtins) { vk::PhysicalDeviceProperties2 props2; vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; props2.pNext = &sm_props; device.getProperties2(&props2); // Turing has 32, following architectures have 48 if (sm_props.shaderWarpsPerSM == 32) { return vk_device_architecture::NVIDIA_TURING; } } } return vk_device_architecture::OTHER; } enum vk_conv_shapes { CONV_SHAPE_128x128, CONV_SHAPE_64x32, CONV_SHAPE_32x256, CONV_SHAPE_COUNT, }; struct vk_conv_block_size { uint32_t K; uint32_t NPQ; uint32_t CRS; }; vk_conv_block_size vk_conv_block_sizes[CONV_SHAPE_COUNT] = { // K NPQ CRS { 128, 128, 16 }, // CONV_SHAPE_128x128 { 64, 32, 32 }, // CONV_SHAPE_64x32 { 32, 256, 16 }, // CONV_SHAPE_32x256 }; enum dmmv_wg_sizes { DMMV_WG_SIZE_SUBGROUP, DMMV_WG_SIZE_LARGE, DMMV_WG_SIZE_COUNT, }; enum FaCodePath { FA_SCALAR, FA_COOPMAT1, FA_COOPMAT2, }; struct vk_fa_pipeline_state { uint32_t HSK, HSV; uint32_t Br, Bc; uint32_t D_split, row_split; bool shmem_staging; FaCodePath path; uint32_t workgroup_size, subgroup_size; bool aligned; bool f32acc; uint32_t flags; uint32_t limit_occupancy_shmem; ggml_type k_type; ggml_type v_type; bool operator<(const vk_fa_pipeline_state &b) const { return std::tie(HSK, HSV, Br, Bc, D_split, row_split, shmem_staging, path, workgroup_size, subgroup_size, aligned, f32acc, flags, limit_occupancy_shmem, k_type, v_type) < std::tie(b.HSK, b.HSV, b.Br, b.Bc, b.D_split, b.row_split, b.shmem_staging, b.path, b.workgroup_size, b.subgroup_size, b.aligned, b.f32acc, b.flags, b.limit_occupancy_shmem, b.k_type, b.v_type); } }; struct vk_conv2d_pipeline_state { vk_conv2d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t p0, uint32_t p1, uint32_t d0, uint32_t d1, uint32_t KW, uint32_t KH) : s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), KW(KW), KH(KH) {} uint32_t s0, s1, p0, p1, d0, d1, KW, KH; bool operator<(const vk_conv2d_pipeline_state &b) const { return std::tie(s0, s1, p0, p1, d0, d1, KW, KH) < std::tie(b.s0, b.s1, b.p0, b.p1, b.d0, b.d1, b.KW, b.KH); } }; struct vk_solve_tri_pipeline_state { vk_solve_tri_pipeline_state(uint32_t N, uint32_t K) : N(N), K(K) {} uint32_t N, K; bool operator<(const vk_solve_tri_pipeline_state &b) const { return std::tie(N, K) < std::tie(b.N, b.K); } }; enum shader_reduction_mode { SHADER_REDUCTION_MODE_SHMEM, SHADER_REDUCTION_MODE_HYBRID, SHADER_REDUCTION_MODE_SUBGROUP, SHADER_REDUCTION_MODE_COUNT, }; // argsort pipelines for up to 1<<10 invocations per workgroup static constexpr uint32_t num_argsort_pipelines = 11; static constexpr uint32_t num_topk_moe_pipelines = 10; static constexpr uint32_t num_topk_pipelines = 11; static constexpr std::initializer_list topk_moe_early_softmax_norm{ GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV, GGML_OP_RESHAPE }; static constexpr std::initializer_list topk_moe_sigmoid_norm_bias{ GGML_OP_UNARY, GGML_OP_RESHAPE, GGML_OP_ADD, GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV, GGML_OP_RESHAPE }; static constexpr std::initializer_list topk_moe_early_softmax { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS }; static constexpr std::initializer_list topk_moe_late_softmax { GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE, GGML_OP_SOFT_MAX, GGML_OP_RESHAPE }; //node #978 ( SOFT_MAX): ffn_moe_probs-15 ( 0K) [Vulka ] use=2: ffn_moe_logits-15 ( 0K) [Vulka ] //node #979 ( RESHAPE): ffn_moe_probs-15 (re ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ] //node #980 ( ARGSORT): ffn_moe_argsort-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ] //node #981 ( VIEW): ffn_moe_topk-15 ( 0K) [Vulka ] use=4: ffn_moe_argsort-15 ( 0K) [Vulka ] //node #982 ( GET_ROWS): ffn_moe_weights-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 (re ( 0K) [Vulka ] ffn_moe_topk-15 ( 0K) [Vulka ] //node #983 ( RESHAPE): ffn_moe_weights-15 ( ( 0K) [Vulka ] use=2: ffn_moe_weights-15 ( 0K) [Vulka ] //node #984 ( SUM_ROWS): ffn_moe_weights_sum- ( 0K) [Vulka ] use=1: ffn_moe_weights-15 ( ( 0K) [Vulka ] //node #985 ( CLAMP): ffn_moe_weights_sum_ ( 0K) [Vulka ] use=1: ffn_moe_weights_sum- ( 0K) [Vulka ] //node #986 ( DIV): ffn_moe_weights_norm ( 0K) [Vulka ] use=1: ffn_moe_weights-15 ( ( 0K) [Vulka ] ffn_moe_weights_sum_ ( 0K) [Vulka ] //node #987 ( RESHAPE): ffn_moe_weights_norm ( 0K) [Vulka ] use=1: ffn_moe_weights_norm ( 0K) [Vulka ] static constexpr std::initializer_list> topk_moe_early_softmax_norm_edges { { 1, 0, 0 }, // reshape->src[0] == softmax { 2, 0, 0 }, // argsort->src[0] == softmax { 3, 0, 2 }, // view->src[0] == argsort { 4, 0, 1 }, // get_rows->src[0] == reshape { 4, 1, 3 }, // get_rows->src[1] == view { 5, 0, 4 }, // reshape->src[0] == get_rows { 6, 0, 5 }, // sum_rows->src[0] == reshape { 7, 0, 6 }, // clamp->src[0] == sum_rows { 8, 0, 5 }, // div->src[0] == reshape { 8, 1, 7 }, // div->src[1] == clamp { 9, 0, 8 }, // reshape->src[0] == div }; //node #436 ( UNARY): ffn_moe_probs-10 ( 256K) [Vulka ] use=2: ffn_moe_logits-10 ( 256K) [Vulka ] //node #437 ( RESHAPE): ffn_moe_probs-10 (re ( 256K) [Vulka ] use=1: ffn_moe_probs-10 ( 256K) [Vulka ] //node #438 ( ADD): ffn_moe_probs_biased ( 256K) [Vulka ] use=1: ffn_moe_probs-10 ( 256K) [Vulka ] blk.10.exp_probs_b.b ( 0K) [Vulka ] //node #439 ( ARGSORT): ffn_moe_argsort-10 ( 256K) [Vulka ] use=1: ffn_moe_probs_biased ( 256K) [Vulka ] //node #440 ( VIEW): ffn_moe_topk-10 ( 255K) [Vulka ] use=3: ffn_moe_argsort-10 ( 256K) [Vulka ] //node #441 ( GET_ROWS): ffn_moe_weights-10 ( 12K) [Vulka ] use=1: ffn_moe_probs-10 (re ( 256K) [Vulka ] ffn_moe_topk-10 ( 255K) [Vulka ] //node #442 ( RESHAPE): ffn_moe_weights-10 ( ( 12K) [Vulka ] use=2: ffn_moe_weights-10 ( 12K) [Vulka ] //node #443 ( SUM_ROWS): ffn_moe_weights_sum- ( 2K) [Vulka ] use=1: ffn_moe_weights-10 ( ( 12K) [Vulka ] //node #444 ( CLAMP): ffn_moe_weights_sum_ ( 2K) [Vulka ] use=1: ffn_moe_weights_sum- ( 2K) [Vulka ] //node #445 ( DIV): ffn_moe_weights_norm ( 12K) [Vulka ] use=1: ffn_moe_weights-10 ( ( 12K) [Vulka ] ffn_moe_weights_sum_ ( 2K) [Vulka ] //node #446 ( RESHAPE): ffn_moe_weights_norm ( 12K) [Vulka ] use=1: ffn_moe_weights_norm ( 12K) [Vulka ] static constexpr std::initializer_list> topk_moe_sigmoid_norm_bias_edges { { 1, 0, 0 }, // reshape->src[0] == sigmoid { 2, 0, 0 }, // add->src[0] == sigmoid { 3, 0, 2 }, // argsort->src[0] == add { 4, 0, 3 }, // view->src[0] == argsort { 5, 0, 1 }, // get_rows->src[0] == reshape { 5, 1, 4 }, // get_rows->src[1] == view { 6, 0, 5 }, // reshape->src[0] == get_rows { 7, 0, 6 }, // sum_rows->src[0] == reshape { 8, 0, 7 }, // clamp->src[0] == sum_rows { 9, 0, 6 }, // div->src[0] == reshape { 9, 1, 8 }, // div->src[1] == clamp {10, 0, 9 }, // reshape->src[0] == div }; // same as early_softmax_norm but ending after the get_rows static constexpr std::initializer_list> topk_moe_early_softmax_edges { { 1, 0, 0 }, // reshape->src[0] == softmax { 2, 0, 0 }, // argsort->src[0] == softmax { 3, 0, 2 }, // view->src[0] == argsort { 4, 0, 1 }, // get_rows->src[0] == reshape { 4, 1, 3 }, // get_rows->src[1] == view }; //node #652 ( ARGSORT): ffn_moe_argsort-11 ( 0K) [Vulka ] use=1: ffn_moe_probs-11 ( 0K) [Vulka ] //node #653 ( VIEW): ffn_moe_topk-11 ( 0K) [Vulka ] use=7: ffn_moe_argsort-11 ( 0K) [Vulka ] //node #654 ( GET_ROWS): ffn_moe_weights-11 ( 0K) [Vulka ] use=1: ffn_moe_probs-11 (re ( 0K) [Vulka ] ffn_moe_topk-11 ( 0K) [Vulka ] //node #655 ( RESHAPE): ffn_moe_weights-11 ( ( 0K) [Vulka ] use=1: ffn_moe_weights-11 ( 0K) [Vulka ] //node #656 ( SOFT_MAX): node_656 ( 0K) [Vulka ] use=1: ffn_moe_weights-11 ( ( 0K) [Vulka ] //node #657 ( RESHAPE): ffn_moe_weights_soft ( 0K) [Vulka ] use=1: node_656 ( 0K) [Vulka ] static constexpr std::initializer_list> topk_moe_late_softmax_edges { { 1, 0, 0 }, // view->src[0] == argsort { 2, 1, 1 }, // get_rows->src[1] == view { 3, 0, 2 }, // reshape->src[0] == get_rows { 4, 0, 3 }, // soft_max->src[0] == reshape { 5, 0, 4 }, // reshape->src[0] == soft_max }; enum topk_moe_mode { TOPK_MOE_EARLY_SOFTMAX, TOPK_MOE_EARLY_SOFTMAX_NORM, TOPK_MOE_LATE_SOFTMAX, TOPK_MOE_SIGMOID_NORM_BIAS, TOPK_MOE_COUNT, }; static constexpr std::initializer_list> rope_view_set_rows_edges { { 1, 0, 0 }, // view->src[0] == rope { 2, 0, 1 }, // set_rows->src[0] == view }; static constexpr std::initializer_list> rms_norm_mul_rope_view_set_rows_edges { { 1, 0, 0 }, // mul->src[0] == rms { 2, 0, 1 }, // rope->src[0] == mul { 3, 0, 2 }, // view->src[0] == rope { 4, 0, 3 }, // set_rows->src[0] == view }; struct vk_device_struct { std::recursive_mutex mutex; vk::PhysicalDevice physical_device; vk::PhysicalDeviceProperties properties; std::string name; uint64_t max_memory_allocation_size; uint64_t max_buffer_size; uint64_t suballocation_block_size; uint64_t min_imported_host_pointer_alignment; bool external_memory_host {}; bool fp16; bool bf16; bool pipeline_robustness; bool memory_priority; vk::Device device; uint32_t vendor_id; vk::DriverId driver_id; vk_device_architecture architecture; vk_queue compute_queue; vk_queue transfer_queue; bool single_queue; bool support_async; bool async_use_transfer_queue; uint32_t subgroup_size; uint32_t subgroup_size_log2; uint32_t shader_core_count; bool uma; bool prefer_host_memory; bool float_controls_rte_fp16; bool subgroup_basic; bool subgroup_arithmetic; bool subgroup_shuffle; bool subgroup_ballot; bool subgroup_clustered; bool subgroup_vote; bool multi_add; bool shader_int64; bool buffer_device_address; bool vulkan_memory_model; bool add_rms_fusion; uint32_t partials_binding_alignment; bool shader_64b_indexing; bool integer_dot_product; // 0: default, 1: force mmvq, -1: disable mmvq int32_t mmvq_mode; bool subgroup_size_control; uint32_t subgroup_min_size; uint32_t subgroup_max_size; bool subgroup_require_full_support; // floor(log2(maxComputeWorkGroupInvocations)) uint32_t max_workgroup_size_log2 {}; bool coopmat_support; bool coopmat_acc_f32_support {}; bool coopmat_acc_f16_support {}; bool coopmat_bf16_support {}; bool coopmat_support_16x16x16_f16acc {}; bool coopmat_support_16x16x16_f32acc {}; bool coopmat1_fa_support {}; uint32_t coopmat_m; uint32_t coopmat_n; uint32_t coopmat_k; bool coopmat_int_support; uint32_t coopmat_int_m; uint32_t coopmat_int_n; uint32_t coopmat_int_k; bool coopmat2; bool pipeline_executable_properties_support {}; size_t idx; bool mul_mat_l[GGML_TYPE_COUNT]; bool mul_mat_m[GGML_TYPE_COUNT]; bool mul_mat_s[GGML_TYPE_COUNT]; bool mul_mat_id_l[GGML_TYPE_COUNT]; bool mul_mat_id_m[GGML_TYPE_COUNT]; bool mul_mat_id_s[GGML_TYPE_COUNT]; vk::DescriptorSetLayout dsl; vk_matmul_pipeline pipeline_matmul_f32 {}; vk_matmul_pipeline pipeline_matmul_f32_f16 {}; vk_matmul_pipeline pipeline_matmul_bf16 {}; vk_matmul_pipeline2 pipeline_matmul_f16; vk_matmul_pipeline2 pipeline_matmul_f16_f32; vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT]; vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT]; vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_COUNT]; vk_matmul_pipeline pipeline_matmul_id_f32 {}; vk_matmul_pipeline pipeline_matmul_id_bf16 {}; vk_matmul_pipeline2 pipeline_matmul_id_f16; vk_matmul_pipeline2 pipeline_matmul_id_f16_f32; vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT]; vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_COUNT]; vk_pipeline pipeline_matmul_split_k_reduce; vk_pipeline pipeline_quantize_q8_1_x4; vk_pipeline pipeline_dequant[GGML_TYPE_COUNT]; vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT][mul_mat_vec_max_cols]; vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT][mul_mat_vec_max_cols]; vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT]; vk_pipeline pipeline_dequant_mul_mat_vec_q8_1_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT][mul_mat_vec_max_cols]; vk_pipeline pipeline_dequant_mul_mat_vec_id_q8_1_f32[DMMV_WG_SIZE_COUNT][GGML_TYPE_COUNT]; vk_pipeline pipeline_mul_mat_vec_p021_f16_f32[p021_max_gqa_ratio]; vk_pipeline pipeline_mul_mat_vec_nc_f16_f32; vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT]; vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT]; vk_pipeline pipeline_acc_f32; vk_pipeline pipeline_set_f32; // [src0 0=fp32,1=fp16][src1 0=fp32,1=fp16][dst 0=fp32,1=fp16] vk_pipeline pipeline_add[2][2][2]; vk_pipeline pipeline_add_norepeat[2][2][2]; vk_pipeline pipeline_sub[2][2][2]; vk_pipeline pipeline_sub_norepeat[2][2][2]; vk_pipeline pipeline_mul[2][2][2]; vk_pipeline pipeline_mul_norepeat[2][2][2]; vk_pipeline pipeline_div[2][2][2]; vk_pipeline pipeline_div_norepeat[2][2][2]; vk_pipeline pipeline_add_rms[2][2][2]; vk_pipeline pipeline_add_rms_norepeat[2][2][2]; // indexed by num_additional_fused_ops == num_adds - 1 vk_pipeline pipeline_multi_add[MAX_FUSED_ADDS]; vk_pipeline pipeline_multi_add_rms[MAX_FUSED_ADDS]; vk_pipeline pipeline_add_id_f32; vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32; vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bicubic_f32, pipeline_upscale_bilinear_antialias_f32; vk_pipeline pipeline_scale_f32; vk_pipeline pipeline_sqr_f32; vk_pipeline pipeline_sqrt_f32; vk_pipeline pipeline_sin_f32; vk_pipeline pipeline_cos_f32; vk_pipeline pipeline_log[2]; vk_pipeline pipeline_tri[2]; vk_pipeline pipeline_diag[2]; vk_pipeline pipeline_clamp_f32; vk_pipeline pipeline_pad_f32; vk_pipeline pipeline_roll_f32; vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32; vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16, pipeline_cpy_f32_i32, pipeline_cpy_i32_f32; vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16, pipeline_contig_cpy_f32_i32, pipeline_contig_cpy_i32_f32; vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT]; vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT]; vk_pipeline pipeline_cpy_transpose_16, pipeline_cpy_transpose_32; vk_pipeline pipeline_set_rows_i32[GGML_TYPE_COUNT]; vk_pipeline pipeline_set_rows_i64[GGML_TYPE_COUNT]; vk_pipeline pipeline_norm_f32; vk_pipeline pipeline_group_norm_f32; vk_pipeline pipeline_rms_norm_f32; vk_pipeline pipeline_rms_norm_mul_f32; vk_pipeline pipeline_rms_norm_partials_f32; vk_pipeline pipeline_rms_norm_mul_partials_f32; vk_pipeline pipeline_rms_norm_mul_rope_f32_f32; vk_pipeline pipeline_rms_norm_mul_rope_f32_f16; vk_pipeline pipeline_rms_norm_back_f32; vk_pipeline pipeline_l2_norm_f32; // [src/dst 0=fp32,1=fp16] vk_pipeline pipeline_exp[2]; vk_pipeline pipeline_elu[2]; vk_pipeline pipeline_gelu[2]; vk_pipeline pipeline_gelu_erf[2]; vk_pipeline pipeline_gelu_quick[2]; vk_pipeline pipeline_silu[2]; vk_pipeline pipeline_relu[2]; vk_pipeline pipeline_xielu[2]; vk_pipeline pipeline_neg[2]; vk_pipeline pipeline_tanh[2]; vk_pipeline pipeline_sigmoid[2]; vk_pipeline pipeline_hardsigmoid[2]; vk_pipeline pipeline_hardswish[2]; vk_pipeline pipeline_abs[2]; vk_pipeline pipeline_softplus[2]; vk_pipeline pipeline_step[2]; vk_pipeline pipeline_round[2]; vk_pipeline pipeline_ceil[2]; vk_pipeline pipeline_floor[2]; vk_pipeline pipeline_trunc[2]; vk_pipeline pipeline_sgn[2]; vk_pipeline pipeline_add1_f16_f16; vk_pipeline pipeline_add1_f16_f32; vk_pipeline pipeline_add1_f32_f32; vk_pipeline pipeline_arange_f32; vk_pipeline pipeline_fill_f32; vk_pipeline pipeline_fill_f16; vk_pipeline pipeline_geglu[2]; vk_pipeline pipeline_reglu[2]; vk_pipeline pipeline_swiglu[2]; vk_pipeline pipeline_swiglu_oai[2]; vk_pipeline pipeline_geglu_erf[2]; vk_pipeline pipeline_geglu_quick[2]; vk_pipeline pipeline_leaky_relu_f32; vk_pipeline pipeline_silu_back_f32; vk_pipeline pipeline_diag_mask_inf_f32; vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16; vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512; vk_pipeline pipeline_soft_max_back_f32; vk_pipeline pipeline_soft_max_large1_f32, pipeline_soft_max_large1_f32_f16; vk_pipeline pipeline_soft_max_large2_f32, pipeline_soft_max_large2_f32_f16; vk_pipeline pipeline_soft_max_large3_f32, pipeline_soft_max_large3_f32_f16; vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16; vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16; vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16, pipeline_rope_multi_f32_f16; vk_pipeline pipeline_rope_vision_f32, pipeline_rope_vision_f16; vk_pipeline pipeline_argsort_f32[num_argsort_pipelines]; vk_pipeline pipeline_argsort_large_f32[num_argsort_pipelines]; vk_pipeline pipeline_topk_f32[num_topk_pipelines]; vk_pipeline pipeline_sum_rows_f32; vk_pipeline pipeline_cumsum_f32; vk_pipeline pipeline_cumsum_small_f32; vk_pipeline pipeline_cumsum_multipass1_f32; vk_pipeline pipeline_cumsum_multipass2_f32; vk_pipeline pipeline_argmax_f32; vk_pipeline pipeline_count_equal_i32; std::map pipeline_solve_tri_f32; vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16; vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16; vk_pipeline pipeline_timestep_embedding_f32; vk_pipeline pipeline_conv_transpose_1d_f32; vk_pipeline pipeline_pool2d_f32; vk_pipeline pipeline_rwkv_wkv6_f32; vk_pipeline pipeline_rwkv_wkv7_f32; // [size_idx][kda] where size_idx: 0=d32, 1=d64, 2=d128 vk_pipeline pipeline_gated_delta_net[3][2]; vk_pipeline pipeline_ssm_scan_f32_d128; vk_pipeline pipeline_ssm_scan_f32_d256; vk_pipeline pipeline_ssm_conv_f32; vk_pipeline pipeline_opt_step_adamw_f32; vk_pipeline pipeline_opt_step_sgd_f32; std::map pipeline_conv2d_f32[CONV_SHAPE_COUNT]; std::map pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT]; std::map pipeline_conv_transpose_2d_f32[CONV_SHAPE_COUNT]; std::map pipeline_conv_transpose_2d_f16_f32[CONV_SHAPE_COUNT]; vk_pipeline pipeline_conv2d_dw_whcn_f32, pipeline_conv2d_dw_whcn_f16_f32; vk_pipeline pipeline_conv2d_dw_cwhn_f32, pipeline_conv2d_dw_cwhn_f16_f32; std::map pipeline_flash_attn_f32_f16[GGML_TYPE_COUNT]; std::map, vk_pipeline> pipeline_fa_mask_opt; vk_pipeline pipeline_flash_attn_split_k_reduce; vk_pipeline pipeline_count_experts; // [2] is for whether to take n_experts from spec constant (0) or push constant (1) vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][2]; std::vector all_pipelines; std::vector> pinned_memory; vk::Fence fence; vk_buffer sync_staging; ggml_backend_buffer_type buffer_type; bool disable_fusion; bool disable_host_visible_vidmem; bool allow_sysmem_fallback; bool disable_graph_optimize; std::unique_ptr memory_logger; ~vk_device_struct() { VK_LOG_DEBUG("destroy device " << name); device.destroyFence(fence); ggml_vk_destroy_buffer(sync_staging); compute_queue.cmd_pool.destroy(device); transfer_queue.cmd_pool.destroy(device); for (auto& pipeline : all_pipelines) { if (pipeline.expired()) { continue; } vk_pipeline pl = pipeline.lock(); ggml_vk_destroy_pipeline(device, pl); } all_pipelines.clear(); device.destroyDescriptorSetLayout(dsl); device.destroy(); } }; void vk_command_pool::init(vk_device& device, vk_queue *q_) { cmd_buffers.clear(); q = q_; vk::CommandPoolCreateInfo command_pool_create_info( vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT), q->queue_family_index); pool = device->device.createCommandPool(command_pool_create_info); } void vk_command_pool::destroy(vk::Device& device) { device.destroyCommandPool(pool); pool = nullptr; cmd_buffers.clear(); } struct vk_buffer_struct { vk::Buffer buffer = VK_NULL_HANDLE; vk::DeviceMemory device_memory = VK_NULL_HANDLE; vk::MemoryPropertyFlags memory_property_flags; void * ptr; size_t size = 0; vk::DeviceAddress bda_addr {}; vk_device device; ~vk_buffer_struct() { if (size == 0) { return; } VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")"); device->device.freeMemory(device_memory); device->device.destroyBuffer(buffer); } }; struct vk_subbuffer { vk_buffer buffer; uint64_t offset; uint64_t size; operator vk::DescriptorBufferInfo() const { return { buffer->buffer, offset, size }; } }; struct vk_semaphore { vk::Semaphore s; uint64_t value; }; // vk_event is used for the event-related backend interfaces. It uses vk::Events for // event_wait and a timeline semaphore for event_synchronize. Polling on an event for // event_synchronize wouldn't be sufficient to wait for command buffers to complete, // and would lead to validation errors. struct vk_event { std::vector events_free; // Events available for reuse std::vector events_submitted; // Events that are fully submitted and can be reused on next synchronize vk::Event event; bool has_event; vk_semaphore tl_semaphore; vk_command_buffer* cmd_buffer = nullptr; uint64_t cmd_buffer_use_counter = 0; }; struct vk_submission { vk_command_buffer* buffer = nullptr; std::vector wait_semaphores; std::vector signal_semaphores; }; typedef std::vector vk_sequence; struct vk_mat_mat_push_constants { uint32_t M; uint32_t N; uint32_t K; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; uint32_t base_work_group_z; uint32_t num_batches; uint32_t k_split; uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3; uint32_t padded_N; }; #define MAT_VEC_FUSION_FLAGS_BIAS0 0x1 #define MAT_VEC_FUSION_FLAGS_BIAS1 0x2 #define MAT_VEC_FUSION_FLAGS_SCALE0 0x4 #define MAT_VEC_FUSION_FLAGS_SCALE1 0x8 struct vk_mat_vec_push_constants { uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; uint32_t fusion_flags; uint32_t base_work_group_y; uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3; }; struct vk_mat_vec_p021_push_constants { uint32_t ncols_x; uint32_t nrows_x; uint32_t nchannels_x; uint32_t nchannels_y; uint32_t b_offset; uint32_t d_offset; uint32_t fusion_flags; }; struct vk_mat_vec_nc_push_constants { uint32_t ncols_x; uint32_t nrows_x; uint32_t row_stride_x; uint32_t channel_stride_x; uint32_t channel_stride_y; uint32_t channel_x_divisor; uint32_t ne12; uint32_t b_offset; uint32_t d_offset; uint32_t nb03; uint32_t nb13; uint32_t nb23; uint32_t fusion_flags; }; struct vk_mat_mat_id_push_constants { uint32_t M; uint32_t N; uint32_t K; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; uint32_t nei0; uint32_t nei1; uint32_t nbi1; uint32_t ne11; uint32_t padded_N; }; struct vk_mat_vec_id_push_constants { uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; uint32_t fusion_flags; uint32_t nei0; uint32_t ne11; uint32_t expert_i1; uint32_t nbi1; }; struct vk_flash_attn_push_constants { uint32_t N; uint32_t KV; uint32_t ne1; uint32_t ne2; uint32_t ne3; uint32_t neq2; uint32_t neq3; uint32_t nek2; uint32_t nek3; uint32_t nev2; uint32_t nev3; uint32_t nem1; uint32_t nem2; uint32_t nem3; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t nb21; uint32_t nb22; uint32_t nb23; float scale; float max_bias; float logit_softcap; uint32_t mask_n_head_log2; float m0; float m1; uint32_t gqa_ratio; uint32_t split_kv; uint32_t k_num; }; static_assert(sizeof(vk_flash_attn_push_constants) <= 128, "sizeof(vk_flash_attn_push_constants) must be <= 128"); struct vk_op_push_constants { uint32_t KX; uint32_t KY; float param1; float param2; float param3; float param4; }; struct vk_op_count_experts_push_constants { uint32_t ne00; uint32_t ne01; uint32_t nb00; uint32_t nb01; uint32_t a_offset; }; struct vk_op_glu_push_constants { uint32_t N; uint32_t ne00; uint32_t ne20; uint32_t mode; // 0: default, 1: swapped, 2: split float alpha; // for swiglu_oai float limit; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne01; uint32_t ne02; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t ne11; uint32_t ne12; }; struct vk_op_unary_push_constants { uint32_t ne; uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t misalign_offsets; float param1; float param2; uint32_t ne0_012mp; uint32_t ne0_012L; uint32_t ne0_01mp; uint32_t ne0_01L; uint32_t ne0_0mp; uint32_t ne0_0L; uint32_t ne1_012mp; uint32_t ne1_012L; uint32_t ne1_01mp; uint32_t ne1_01L; uint32_t ne1_0mp; uint32_t ne1_0L; }; static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128"); static vk_op_unary_push_constants vk_op_unary_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst, int64_t ne = 0) { GGML_ASSERT(ne != 0 || (ggml_nelements(src0) == ggml_nelements(dst))); ne = ne != 0 ? ne : ggml_nelements(dst); GGML_ASSERT(ne <= (int64_t)std::numeric_limits::max()); vk_op_unary_push_constants p{}; p.ne = (uint32_t)ne; size_t src0_tsize = ggml_type_size(src0->type); p.ne00 = (uint32_t)src0->ne[0]; p.ne01 = (uint32_t)src0->ne[1]; p.ne02 = (uint32_t)src0->ne[2]; p.ne03 = (uint32_t)src0->ne[3]; p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize); p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize); p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize); p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize); size_t dst_tsize = ggml_type_size(dst->type); p.ne10 = (uint32_t)dst->ne[0]; p.ne11 = (uint32_t)dst->ne[1]; p.ne12 = (uint32_t)dst->ne[2]; p.ne13 = (uint32_t)dst->ne[3]; p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize); p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize); p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize); p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize); return p; // offsets are initialized later in ggml_vk_op } struct vk_op_pad_push_constants { uint32_t ne; uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t misalign_offsets; uint32_t circular; uint32_t lp0; uint32_t rp0; uint32_t lp1; uint32_t rp1; uint32_t lp2; uint32_t rp2; uint32_t lp3; uint32_t rp3; }; static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst) { int64_t ne = ggml_nelements(dst); GGML_ASSERT(ne <= (int64_t)std::numeric_limits::max()); vk_op_pad_push_constants p{}; p.ne = (uint32_t)ne; size_t src0_tsize = ggml_type_size(src0->type); p.ne00 = (uint32_t)src0->ne[0]; p.ne01 = (uint32_t)src0->ne[1]; p.ne02 = (uint32_t)src0->ne[2]; p.ne03 = (uint32_t)src0->ne[3]; p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize); p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize); p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize); p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize); size_t dst_tsize = ggml_type_size(dst->type); p.ne10 = (uint32_t)dst->ne[0]; p.ne11 = (uint32_t)dst->ne[1]; p.ne12 = (uint32_t)dst->ne[2]; p.ne13 = (uint32_t)dst->ne[3]; p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize); p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize); p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize); p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize); p.lp0 = dst->op_params[0]; p.rp0 = dst->op_params[1]; p.lp1 = dst->op_params[2]; p.rp1 = dst->op_params[3]; p.lp2 = dst->op_params[4]; p.rp2 = dst->op_params[5]; p.lp3 = dst->op_params[6]; p.rp3 = dst->op_params[7]; p.circular = dst->op_params[8]; return p; // fastdiv values and offsets are initialized later in ggml_vk_op } // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. // Precompute mp (m' in the paper) and L such that division // can be computed using a multiply (high 32b of 64b result) // and a shift: // // n/d = (mulhi(n, mp) + n) >> L; static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) { // compute L = ceil(log2(d)); L = 0; while (L < 32 && (uint32_t{1} << L) < d) { L++; } mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1); } template void init_pushconst_fastdiv(T &p) { GGML_UNUSED(p); static_assert(!std::is_const::value, "unexpected type"); } template <> void init_pushconst_fastdiv(vk_op_unary_push_constants &p) { // Compute magic values to divide by these six numbers. init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, p.ne0_012L); init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, p.ne0_01L); init_fastdiv_values(p.ne00, p.ne0_0mp, p.ne0_0L); init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, p.ne1_012L); init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, p.ne1_01L); init_fastdiv_values(p.ne10, p.ne1_0mp, p.ne1_0L); } struct vk_op_binary_push_constants { uint32_t ne; uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23; uint32_t misalign_offsets; float param1; float param2; int32_t param3; }; struct vk_op_multi_add_push_constants { // shape for dst uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; // strides for srcs+dst uint32_t nb[MAX_PARAMETER_COUNT][4]; uint32_t rms_partials; }; // update multi_add.comp if this changes static_assert(MAX_PARAMETER_COUNT == 12); static_assert(sizeof(vk_op_multi_add_push_constants) <= 256); struct vk_op_topk_moe_push_constants { uint32_t n_rows; uint32_t n_experts_push; uint32_t n_expert_used; float clamp_min; float clamp_max; uint32_t gating_func; uint32_t has_bias; uint32_t with_norm; float output_scale; float output_bias; }; struct vk_op_add_id_push_constants { uint32_t ne0; uint32_t ne1; uint32_t s01; uint32_t s02; uint32_t s11; uint32_t s21; }; struct vk_op_diag_mask_push_constants { uint32_t ncols; uint32_t rows_per_channel; int32_t n_past; }; struct vk_op_rope_push_constants { uint32_t rope_mode; uint32_t nrows; uint32_t n_dims; float freq_scale; float freq_base; float ext_factor; float attn_factor; float corr_dims[2]; float theta_scale; uint32_t has_ff; int32_t sections[4]; uint32_t is_imrope; uint32_t is_back; uint32_t set_rows_stride; uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t nb11; uint32_t nb12; uint32_t nb13; }; static_assert(sizeof(vk_op_rope_push_constants) <= 128, "sizeof(vk_op_rope_push_constants) must be <= 128"); // For fused rms_norm+mul+rope(+view+set_rows) struct vk_op_rms_norm_mul_rope_push_constants { vk_op_binary_push_constants bin; vk_op_rope_push_constants rope; }; struct vk_op_soft_max_push_constants { uint32_t KX; uint32_t KY; uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne12; uint32_t ne13; uint32_t nb11; uint32_t nb12; uint32_t nb13; float scale; float max_bias; float m0; float m1; uint32_t n_head_log2; uint32_t nrows_x; uint32_t has_sinks; }; struct vk_op_argsort_push_constants { uint32_t ncols; uint32_t ncols_padded; uint32_t ncols_padded_log2; uint32_t nrows; uint32_t order; uint32_t outer_start; uint32_t outer_end; uint32_t inner_start; uint32_t inner_end; }; struct vk_op_topk_push_constants { uint32_t orig_ncols; uint32_t ncols_input; uint32_t ncols_output; uint32_t k; uint32_t nrows; uint32_t first_pass; uint32_t last_pass; }; struct vk_op_im2col_push_constants { uint64_t dst_addr; uint32_t batch_offset; uint32_t offset_delta; uint32_t IC; uint32_t IW; uint32_t IH; uint32_t OW; uint32_t OH; uint32_t KW; uint32_t KH; uint32_t OH_batch; uint32_t CHW; int32_t s0; int32_t s1; int32_t p0; int32_t p1; int32_t d0; int32_t d1; uint32_t batch_IC; }; struct vk_op_im2col_3d_push_constants { uint64_t dst_addr; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t s0; uint32_t s1; uint32_t s2; uint32_t p0; uint32_t p1; uint32_t p2; uint32_t d0; uint32_t d1; uint32_t d2; uint32_t IW; uint32_t IH; uint32_t ID; uint32_t IC; uint32_t KW; uint32_t OH; uint32_t KD_KH_KW; uint32_t KH_KW; uint32_t IC_KD_KH_KW; uint32_t N_OD_OH; uint32_t OD_OH; uint32_t OD_OH_OW_IC_KD_KH_KW; uint32_t OH_OW_IC_KD_KH_KW; uint32_t OW_IC_KD_KH_KW; uint32_t misalign_offsets; }; struct vk_op_timestep_embedding_push_constants { uint32_t nb1; uint32_t dim; uint32_t max_period; }; struct vk_op_conv_transpose_1d_push_constants { uint32_t Cout; uint32_t Cin; uint32_t K; uint32_t L; uint32_t KL; uint32_t nb01; uint32_t nb02; uint32_t nb11; uint32_t nb1; int32_t s0; }; struct vk_op_pool2d_push_constants { uint32_t IW; uint32_t IH; uint32_t OW; uint32_t OH; uint32_t OC; uint32_t pelements; uint32_t op; int32_t k0; int32_t k1; int32_t s0; int32_t s1; int32_t p0; int32_t p1; }; struct vk_op_rwkv_wkv6_push_constants { uint32_t B; uint32_t T; uint32_t C; uint32_t H; }; struct vk_op_rwkv_wkv7_push_constants { uint32_t B; uint32_t T; uint32_t C; uint32_t H; }; struct vk_op_gated_delta_net_push_constants { uint32_t H; uint32_t n_tokens; uint32_t n_seqs; uint32_t s_off; uint32_t sq1, sq2, sq3; uint32_t sv1, sv2, sv3; uint32_t sb1, sb2, sb3; uint32_t neq1, rq3; float scale; }; struct vk_op_ssm_scan_push_constants { uint32_t nb02, nb03, nb12, nb13; uint32_t nb21, nb22, nb31; uint32_t nb42, nb43, nb52, nb53; uint32_t s_off; uint32_t n_head, d_head, n_group, n_tok; }; struct vk_op_ssm_conv_push_constants { uint32_t nb01, nb02; uint32_t nb11; uint32_t dst_nb0, dst_nb1, dst_nb2; uint32_t nc, ncs, nr, n_t, n_s; }; struct vk_op_conv2d_push_constants { uint32_t Cout; uint32_t Cin; uint32_t N; uint32_t W; uint32_t H; uint32_t OW; uint32_t OH; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t nb1; uint32_t nb2; uint32_t nb3; // init_fastdiv_values constants for dividing by OW, OW*OH uint32_t OWmp; uint32_t OWL; uint32_t OWOHmp; uint32_t OWOHL; }; template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) { // Compute magic values to divide by OW, OW*OH init_fastdiv_values(p.OW, p.OWmp, p.OWL); init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL); } struct vk_op_conv2d_dw_push_constants { uint32_t ne; uint32_t batches; uint32_t channels; uint32_t dst_w; uint32_t dst_h; uint32_t src_w; uint32_t src_h; uint32_t knl_w; uint32_t knl_h; int32_t stride_x; int32_t stride_y; int32_t pad_x; int32_t pad_y; int32_t dilation_x; int32_t dilation_y; }; struct vk_op_upscale_push_constants { uint32_t ne; uint32_t a_offset; uint32_t d_offset; uint32_t ne00; uint32_t ne01; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; float sf0; float sf1; float sf2; float sf3; float pixel_offset; }; struct vk_op_sum_rows_push_constants { uint32_t n_cols; uint32_t ne01, ne02; uint32_t nb01, nb02, nb03; uint32_t nb11, nb12, nb13; float weight; uint32_t misalign_offsets; uint32_t ne0_12mp, ne0_12L; uint32_t ne0_1mp, ne0_1L; }; static vk_op_sum_rows_push_constants vk_op_sum_rows_push_constants_init(const ggml_tensor * src, const ggml_tensor * dst, int64_t n_cols) { uint32_t type_size = (uint32_t)ggml_type_size(src->type); vk_op_sum_rows_push_constants p = {}; p.n_cols = (uint32_t)n_cols; p.ne01 = (uint32_t)src->ne[1]; p.ne02 = (uint32_t)src->ne[2]; p.nb01 = (uint32_t)src->nb[1] / type_size; p.nb02 = (uint32_t)src->nb[2] / type_size; p.nb03 = (uint32_t)src->nb[3] / type_size; p.nb11 = (uint32_t)dst->nb[1] / type_size; p.nb12 = (uint32_t)dst->nb[2] / type_size; p.nb13 = (uint32_t)dst->nb[3] / type_size; p.weight = 1.0f; return p; } template <> void init_pushconst_fastdiv(vk_op_sum_rows_push_constants &p) { init_fastdiv_values(p.ne01*p.ne02, p.ne0_12mp, p.ne0_12L); init_fastdiv_values(p.ne01, p.ne0_1mp, p.ne0_1L); } struct vk_quantize_q8_1_push_constants { uint32_t ne; uint32_t num_blocks; }; struct vk_op_flash_attn_split_k_reduce_push_constants { uint32_t D; uint32_t ne1; uint32_t ne2; uint32_t ne3; uint32_t k_num; uint32_t sinks; }; struct vk_op_flash_attn_mask_opt_push_constants { uint32_t nem0; uint32_t nem1; uint32_t nem2; uint32_t nbm1; uint32_t nbm2; uint32_t nbm3; uint32_t nbd1; uint32_t nbd2; uint32_t nbd3; }; // Allow pre-recording command buffers struct vk_staging_memcpy { vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {} void * dst; const void * src; size_t n; }; struct vk_staging_memset { vk_staging_memset(void * _dst, uint32_t _val, size_t _n) : dst(_dst), val(_val), n(_n) {} void * dst; uint32_t val; size_t n; }; struct vk_context_struct { vk_submission * s; std::vector seqs; int exit_tensor_idx; std::vector in_memcpys; std::vector out_memcpys; std::vector memsets; vk_command_pool * p {}; }; typedef std::shared_ptr vk_context; typedef std::weak_ptr vk_context_ref; struct ggml_vk_garbage_collector { std::vector tl_semaphores; std::vector semaphores; std::vector events; std::vector contexts; }; static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx); static void ggml_vk_load_shaders(vk_device& device); static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx); static bool vk_memory_logger_enabled = false; #define VK_LOG_MEMORY(msg) if (vk_memory_logger_enabled) { std::cerr << "ggml_vulkan memory: " << msg << std::endl; } static std::string format_size(size_t size) { const size_t kib = 1024; const size_t mib = kib * 1024; const size_t gib = mib * 1024; std::ostringstream oss; oss << std::fixed << std::setprecision(2); if (size >= gib) { oss << static_cast(size) / gib << " GiB"; } else if (size >= mib) { oss << static_cast(size) / mib << " MiB"; } else if (size >= kib) { oss << static_cast(size) / kib << " KiB"; } else { oss << size << " B"; } return oss.str(); } class vk_memory_logger { public: vk_memory_logger(): total_device(0), total_host(0) {} void log_allocation(vk_buffer_ref buf_ref, size_t size); void log_deallocation(vk_buffer_ref buf_ref); private: std::map allocations; // Track allocations size_t total_device; size_t total_host; static std::mutex log_mutex; }; std::mutex vk_memory_logger::log_mutex; static bool vk_perf_logger_enabled = false; static bool vk_perf_logger_concurrent = false; static bool vk_enable_sync_logger = false; // number of calls between perf logger prints static uint32_t vk_perf_logger_frequency = 1; static std::string vk_pipeline_stats_filter; class vk_perf_logger { public: void print_timings(bool force = false) { if (timings.empty()) { return; } print_count++; if ((print_count % vk_perf_logger_frequency) != 0 && !force) { return; } print_count = 0; uint64_t total_all_op_times = 0; std::cerr << "----------------\nVulkan Timings:" << std::endl; for (const auto & t : timings) { uint64_t total_op_times = 0; for (const auto & time : t.second) { total_op_times += time; } std::cerr << t.first << ": " << t.second.size() << " x " << (total_op_times / t.second.size() / 1000.0) << " us = " << (total_op_times / 1000.0) << " us"; // If we have as many flops entries as timing entries for the op, then compute and log the flops/S. auto it = flops.find(t.first); if (it != flops.end() && (it->second).size() == t.second.size()) { uint64_t total_op_flops = 0; for (const auto & elem : it->second) { total_op_flops += elem; } std::cerr << " (" << (double(total_op_flops) / (1000.0 * 1000.0 * 1000.0)) / (double(total_op_times) / (1000.0 * 1000.0 * 1000.0)) << " GFLOPS/s)"; } total_all_op_times += total_op_times; std::cerr << std::endl; } if (timings.size() > 0) { std::cerr << "Total time: " << total_all_op_times / 1000.0 << " us." << std::endl; } timings.clear(); flops.clear(); } std::string get_node_fusion_name(const ggml_tensor * node, const char *fusion_name, uint64_t *n_flops) { *n_flops = 0; std::string fusion_str; if (fusion_name) { fusion_str = fusion_name + std::string(" "); } if (node->op == GGML_OP_UNARY) { return fusion_str + ggml_unary_op_name(ggml_get_unary_op(node)); } if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) { const uint64_t m = node->ne[0]; const uint64_t n = node->ne[1]; const uint64_t k = node->src[1]->ne[0]; const uint64_t batch = node->ne[2] * node->ne[3]; std::string name = ggml_op_name(node->op); if ((node->op == GGML_OP_MUL_MAT && n <= mul_mat_vec_max_cols) || (node->op == GGML_OP_MUL_MAT_ID && node->src[2]->ne[1] == 1)) { name += "_VEC"; } name += " "; name += ggml_type_name(node->src[0]->type); name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k); if (node->op == GGML_OP_MUL_MAT_ID) { name += " n_expert=" + std::to_string(node->src[0]->ne[2]); } if (batch > 1) { name += " batch=" + std::to_string(batch); } name = fusion_str + name; *n_flops = m * n * (k + (k - 1)) * batch; return name; } if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) { std::string name = ggml_op_name(node->op); ggml_tensor * knl = node->src[0]; uint64_t OW = node->ne[0]; uint64_t OH = node->ne[1]; uint64_t N = node->ne[3]; uint64_t Cout = node->ne[2]; uint64_t KW = knl->ne[0]; uint64_t KH = knl->ne[1]; uint64_t Cin = node->src[1]->ne[2]; // KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ uint64_t size_M = Cout; uint64_t size_K = Cin * KW * KH; uint64_t size_N = N * OW * OH; *n_flops = size_M * size_N * (size_K + (size_K - 1)); name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) + ", N=N*OW*OH=" + std::to_string(size_N); name = fusion_str + name; return name; } if (node->op == GGML_OP_RMS_NORM) { std::string name = ggml_op_name(node->op); name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")"; name = fusion_str + name; return name; } if (node->op == GGML_OP_FLASH_ATTN_EXT) { const ggml_tensor * dst = node; const ggml_tensor * q = node->src[0]; const ggml_tensor * k = node->src[1]; const ggml_tensor * v = node->src[2]; const ggml_tensor * m = node->src[3]; std::stringstream name; name << fusion_str; name << ggml_op_name(node->op) << " dst(" << dst->ne[0] << "," << dst->ne[1] << "," << dst->ne[2] << "," << dst->ne[3] << "), " << " q(" << q->ne[0] << "," << q->ne[1] << "," << q->ne[2] << "," << q->ne[3] << "), " << " k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " << " v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " << " m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")"; *n_flops = 2ull * q->ne[1] * q->ne[2] * (k->ne[0] + v->ne[0]) * k->ne[1] * q->ne[3]; return name.str(); } if (node->op == GGML_OP_TOP_K) { std::stringstream name; name << fusion_str; name << ggml_op_name(node->op) << " K=" << node->ne[0] << " (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")"; return name.str(); } return fusion_str + ggml_op_name(node->op); } void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) { uint64_t n_flops; std::string name = get_node_fusion_name(node, fusion_name, &n_flops); if (n_flops) { flops[name].push_back(n_flops); } timings[name].push_back(time); } void log_timing(const std::vector &nodes, const std::vector &names, uint64_t time) { uint64_t total_flops = 0; std::string name; for (size_t n = 0; n < nodes.size(); ++n) { uint64_t n_flops = 0; name += get_node_fusion_name(nodes[n], names[n], &n_flops); total_flops += n_flops; if (n != nodes.size() - 1) { name += ", "; } } if (total_flops) { flops[name].push_back(total_flops); } timings[name].push_back(time); } private: std::map> timings; std::map> flops; uint32_t print_count {}; }; struct ggml_backend_vk_context { std::string name; vk_device device; size_t semaphore_idx, event_idx; ggml_vk_garbage_collector gc; size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k, prealloc_size_add_rms_partials, prealloc_size_add_rms_partials_offset; vk_buffer prealloc_x, prealloc_y, prealloc_split_k, prealloc_add_rms_partials, sync_staging; vk::Fence fence, almost_ready_fence; bool submit_pending {}; bool almost_ready_fence_pending {}; // Set before op_add and unset after op_rms_norm to indicate that the add should // write partial sums to accumulate the square of the vector components bool do_add_rms_partials_offset_calculation; bool do_add_rms_partials; uint64_t last_total_mul_mat_bytes {}; // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert. vk_pipeline_struct * prealloc_y_last_pipeline_used {}; const ggml_tensor * prealloc_y_last_tensor_used {}; // Track which nodes have been used since the last sync, and whether they were written to std::vector unsynced_nodes_written; std::vector unsynced_nodes_read; // Track which prealloc buffers have pending reads that need to be synchronized. // These are checked before writing to the buffer (and call ggml_vk_sync_buffers if set), // and set to true after the buffer contents are consumed. bool prealloc_x_need_sync, prealloc_y_need_sync, prealloc_split_k_need_sync; vk_context_ref compute_ctx; vk_context_ref transfer_ctx; vk_semaphore transfer_semaphore; uint64_t transfer_semaphore_last_submitted {}; std::vector tensor_ctxs; std::vector descriptor_pools; std::vector descriptor_sets; uint32_t descriptor_set_idx {}; uint32_t pipeline_descriptor_set_requirements {}; vk_command_pool compute_cmd_pool; vk_command_pool transfer_cmd_pool; // number of additional consecutive nodes that are being fused with the // node currently being processed int num_additional_fused_ops {}; // Bitmask of which fused ops need to write an intermediate value to memory. // Bit 'i' means nodes[start_of_fusion + i] writes to memory. // If there's no fusion, bit 0 is still set. int fused_ops_write_mask {}; topk_moe_mode fused_topk_moe_mode {}; bool fused_topk_moe_scale {}; // for GGML_VK_PERF_LOGGER std::unique_ptr perf_logger; vk::QueryPool query_pool; std::vector query_fusion_names; std::vector query_fusion_node_count; std::vector query_nodes; std::vector query_node_idx; int32_t num_queries {}; int32_t query_idx {}; }; static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT static uint64_t vk_tensor_offset(const ggml_tensor * tensor) { if (tensor->view_src) { return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base; } return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; } static uint32_t get_misalign_bytes(const ggml_backend_vk_context * ctx, const ggml_tensor * t) { return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));; } template void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { GGML_UNUSED(p); GGML_UNUSED(src0); GGML_UNUSED(src1); GGML_UNUSED(src2); GGML_UNUSED(src3); GGML_UNUSED(dst); static_assert(!std::is_const::value, "unexpected type"); GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0); GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0); GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0); GGML_ASSERT(!src3 || get_misalign_bytes(ctx, src3) == 0); GGML_ASSERT(!dst || get_misalign_bytes(ctx, dst) == 0); } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_mat_vec_p021_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); p.b_offset = b_offset; p.d_offset = d_offset; GGML_UNUSED(src0); GGML_UNUSED(src2); GGML_UNUSED(src3); } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_mat_vec_nc_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); p.b_offset = b_offset; p.d_offset = d_offset; GGML_UNUSED(src0); GGML_UNUSED(src2); GGML_UNUSED(src3); } struct ggml_backend_vk_buffer_context { vk_device_ref device; vk_buffer dev_buffer; std::string name; ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) : device(device), dev_buffer(dev_buffer), name(name) { } ~ggml_backend_vk_buffer_context() { ggml_vk_destroy_buffer(dev_buffer); } }; void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) { if (!vk_memory_logger_enabled) { return; } std::lock_guard guard(log_mutex); vk_buffer buf = buf_ref.lock(); const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal); const std::string type = device ? "device" : "host"; allocations[buf->buffer] = size; total_device += device ? size : 0; total_host += device ? 0 : size; VK_LOG_MEMORY(buf->device->name << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host)); } void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) { if (buf_ref.expired() || buf_ref.lock()->size == 0 || !vk_memory_logger_enabled) { return; } std::lock_guard guard(log_mutex); vk_buffer buf = buf_ref.lock(); const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal); std::string type = device ? "device" : "host"; auto it = allocations.find(buf->buffer); total_device -= device ? it->second : 0; total_host -= device ? 0 : it->second; if (it != allocations.end()) { VK_LOG_MEMORY(buf->device->name << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host)); allocations.erase(it); } else { VK_LOG_MEMORY("ERROR " << buf->device->name << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer); } } struct vk_instance_t { vk::Instance instance; bool debug_utils_support = false; // VK_EXT_debug_utils enabled PFN_vkSetDebugUtilsObjectNameEXT pfn_vkSetDebugUtilsObjectNameEXT = {}; PFN_vkQueueBeginDebugUtilsLabelEXT pfn_vkQueueBeginDebugUtilsLabelEXT = {}; PFN_vkQueueEndDebugUtilsLabelEXT pfn_vkQueueEndDebugUtilsLabelEXT = {}; PFN_vkCmdBeginDebugUtilsLabelEXT pfn_vkCmdBeginDebugUtilsLabelEXT = {}; PFN_vkCmdEndDebugUtilsLabelEXT pfn_vkCmdEndDebugUtilsLabelEXT = {}; PFN_vkCmdInsertDebugUtilsLabelEXT pfn_vkCmdInsertDebugUtilsLabelEXT = {}; std::vector device_indices; std::vector device_supports_membudget; vk_device devices[GGML_VK_MAX_DEVICES]; }; static bool vk_instance_initialized = false; static vk_instance_t vk_instance; #ifdef GGML_VULKAN_CHECK_RESULTS static size_t vk_skip_checks; static size_t vk_output_tensor; static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name); static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx); static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx); #endif typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); static void ggml_backend_vk_free(ggml_backend_t backend); static VkDeviceSize ggml_vk_get_max_buffer_range(const ggml_backend_vk_context * ctx, const vk_buffer &buf, const VkDeviceSize offset) { const VkDeviceSize range = std::min(VkDeviceSize{buf->size - offset}, VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange}); return range; } // Wait for ctx->fence to be signaled. static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) { // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep // during this wait. if (ctx->almost_ready_fence_pending) { VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence"); ctx->device->device.resetFences({ ctx->almost_ready_fence }); ctx->almost_ready_fence_pending = false; } // Spin (w/pause) waiting for the graph to finish executing. vk::Result result; while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) { if (result != vk::Result::eNotReady) { fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__); exit(1); } for (uint32_t i = 0; i < 100; ++i) { YIELD(); YIELD(); YIELD(); YIELD(); YIELD(); YIELD(); YIELD(); YIELD(); YIELD(); YIELD(); } } ctx->device->device.resetFences({ ctx->fence }); } // variables to track number of compiles in progress static uint32_t compile_count = 0; static std::mutex compile_count_mutex; static std::condition_variable compile_count_cond; static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, std::array wg_denoms, std::vector specialization_constants, bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) { VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << pipeline->name << ", " << entrypoint << ", " << parameter_count << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")"); GGML_ASSERT(parameter_count > 0); GGML_ASSERT(parameter_count <= MAX_PARAMETER_COUNT); GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast(spv_data)); // Patch SPIR-V to enable RTE rounding for FP16, avoiding the need for // separate shader variants compiled with -DRTE16. std::vector spv; if (device->float_controls_rte_fp16) { const uint32_t* spv_words = reinterpret_cast(spv_data); size_t word_count = spv_size / sizeof(uint32_t); spv.assign(spv_words, spv_words + word_count); // Find insertion points respecting SPIR-V layout order: // Header(5) -> OpCapability -> OpExtension -> ... -> OpEntryPoint -> OpExecutionMode -> ... size_t pos = 5; // skip header size_t cap_insert_pos = pos; size_t ext_insert_pos = pos; size_t exec_insert_pos = pos; uint32_t entry_point_id = 0; while (pos < spv.size()) { uint32_t opcode = spv[pos] & spv::OpCodeMask; uint32_t len = spv[pos] >> spv::WordCountShift; if (len == 0) break; if (opcode == spv::OpCapability) { cap_insert_pos = pos + len; ext_insert_pos = pos + len; } else if (opcode == spv::OpExtension) { ext_insert_pos = pos + len; } else if (opcode == spv::OpEntryPoint) { entry_point_id = spv[pos + 2]; exec_insert_pos = pos + len; } else if (opcode == spv::OpExecutionMode || opcode == spv::OpExecutionModeId) { exec_insert_pos = pos + len; } else if (entry_point_id != 0) { break; } pos += len; } // Insert from latest position first so earlier indices stay valid. // OpExecutionMode %entrypoint RoundingModeRTE 16 uint32_t exec_mode[] = { (4u << spv::WordCountShift) | spv::OpExecutionMode, entry_point_id, spv::ExecutionModeRoundingModeRTE, 16 }; spv.insert(spv.begin() + exec_insert_pos, std::begin(exec_mode), std::end(exec_mode)); // OpExtension "SPV_KHR_float_controls" const char ext_str[] = "SPV_KHR_float_controls"; size_t ext_str_words = CEIL_DIV(sizeof(ext_str), sizeof(uint32_t)); std::vector extension(1 + ext_str_words, 0); extension[0] = (uint32_t)((1 + ext_str_words) << spv::WordCountShift) | spv::OpExtension; memcpy(&extension[1], ext_str, sizeof(ext_str)); spv.insert(spv.begin() + ext_insert_pos, extension.begin(), extension.end()); // OpCapability RoundingModeRTE uint32_t capability[] = { (2u << spv::WordCountShift) | spv::OpCapability, spv::CapabilityRoundingModeRTE }; spv.insert(spv.begin() + cap_insert_pos, std::begin(capability), std::end(capability)); shader_module_create_info = vk::ShaderModuleCreateInfo({}, spv.size() * sizeof(uint32_t), spv.data()); } pipeline->shader_module = device->device.createShaderModule(shader_module_create_info); vk::PushConstantRange pcr( vk::ShaderStageFlagBits::eCompute, 0, pipeline->push_constant_size ); vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), device->dsl, pcr); pipeline->layout = device->device.createPipelineLayout(pipeline_layout_create_info); std::vector specialization_entries(specialization_constants.size()); for (size_t i = 0; i < specialization_constants.size(); i++) { specialization_entries[i].constantID = i; specialization_entries[i].offset = i * sizeof(uint32_t); specialization_entries[i].size = sizeof(uint32_t); } vk::SpecializationInfo specialization_info( specialization_entries.size(), specialization_entries.data(), specialization_constants.size() * sizeof(uint32_t), specialization_constants.data() ); vk::PipelineShaderStageCreateFlags pipeline_shader_stage_create_flags{}; if (device->subgroup_require_full_support && require_full_subgroups) { pipeline_shader_stage_create_flags |= vk::PipelineShaderStageCreateFlagBits::eRequireFullSubgroupsEXT; } vk::PipelineShaderStageCreateInfo pipeline_shader_create_info( pipeline_shader_stage_create_flags, vk::ShaderStageFlagBits::eCompute, pipeline->shader_module, entrypoint.c_str(), &specialization_info); vk::PipelineShaderStageRequiredSubgroupSizeCreateInfoEXT pipeline_shader_stage_required_subgroup_size_create_info; pipeline_shader_stage_required_subgroup_size_create_info.requiredSubgroupSize = required_subgroup_size; if (device->subgroup_size_control && required_subgroup_size > 0) { GGML_ASSERT(device->subgroup_min_size <= required_subgroup_size && required_subgroup_size <= device->subgroup_max_size); pipeline_shader_create_info.setPNext(&pipeline_shader_stage_required_subgroup_size_create_info); } vk::ComputePipelineCreateInfo compute_pipeline_create_info( device->pipeline_executable_properties_support ? vk::PipelineCreateFlagBits::eCaptureStatisticsKHR : vk::PipelineCreateFlags{}, pipeline_shader_create_info, pipeline->layout); vk::PipelineRobustnessCreateInfoEXT rci; if (device->pipeline_robustness && disable_robustness) { rci.storageBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled; rci.uniformBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled; compute_pipeline_create_info.setPNext(&rci); } #if defined(VK_EXT_shader_64bit_indexing) vk::PipelineCreateFlags2CreateInfo pipelineFlags2CreateInfo; if (pipeline->is_64b_indexing) { pipelineFlags2CreateInfo.flags = vk::PipelineCreateFlagBits2::e64BitIndexingEXT; if (device->pipeline_executable_properties_support) { pipelineFlags2CreateInfo.flags |= vk::PipelineCreateFlagBits2::eCaptureStatisticsKHR; } pipelineFlags2CreateInfo.setPNext(compute_pipeline_create_info.pNext); compute_pipeline_create_info.setPNext(&pipelineFlags2CreateInfo); } #endif try { pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; } catch (const vk::SystemError& e) { std::cerr << "ggml_vulkan: Compute pipeline creation failed for " << pipeline->name << std::endl; std::cerr << "ggml_vulkan: " << e.what() << std::endl; throw e; } pipeline->compiled = true; if (vk_instance.debug_utils_support) { vk::DebugUtilsObjectNameInfoEXT duoni; duoni.objectType = vk::ObjectType::ePipeline; duoni.pObjectName = pipeline->name.c_str(); duoni.objectHandle = /*reinterpret_cast*/(uint64_t)(static_cast(pipeline->pipeline)); vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast(duoni)); } if (device->pipeline_executable_properties_support) { vk::PipelineExecutableInfoKHR executableInfo; executableInfo.pipeline = pipeline->pipeline; auto statistics = device->device.getPipelineExecutableStatisticsKHR(executableInfo); bool print_stats = !vk_pipeline_stats_filter.empty() && pipeline->name.find(vk_pipeline_stats_filter) != std::string::npos; if (print_stats) { std::cerr << "ggml_vulkan: pipeline stats for " << pipeline->name << ":" << std::endl; } for (auto & s : statistics) { if (print_stats) { std::cerr << "ggml_vulkan: " << s.name.data() << ": "; switch (s.format) { case vk::PipelineExecutableStatisticFormatKHR::eBool32: std::cerr << (s.value.b32 ? "true" : "false"); break; case vk::PipelineExecutableStatisticFormatKHR::eInt64: std::cerr << s.value.i64; break; case vk::PipelineExecutableStatisticFormatKHR::eUint64: std::cerr << s.value.u64; break; case vk::PipelineExecutableStatisticFormatKHR::eFloat64: std::cerr << s.value.f64; break; } std::cerr << std::endl; } // "Register Count" is reported by NVIDIA drivers. if (strcmp(s.name, "Register Count") == 0) { VK_LOG_DEBUG(pipeline->name << " " << s.name << ": " << s.value.u64 << " registers"); pipeline->register_count = (uint32_t)s.value.u64; } } } device->all_pipelines.push_back(pipeline); { std::lock_guard guard(compile_count_mutex); assert(compile_count > 0); compile_count--; } compile_count_cond.notify_all(); } static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) { VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")"); device.destroyPipelineLayout(pipeline->layout); device.destroyShaderModule(pipeline->shader_module); device.destroyPipeline(pipeline->pipeline); } static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) { VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")"); ctx->pipeline_descriptor_set_requirements += n; if (!pipeline->compiled) { pipeline->needed = true; ggml_vk_load_shaders(ctx->device); } ggml_pipeline_allocate_descriptor_sets(ctx); } static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) { if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) { // Enough descriptors are available return; } vk_device& device = ctx->device; // Grow by 50% to avoid frequent allocations uint32_t needed = std::max(3 * ctx->descriptor_sets.size() / 2, size_t{ctx->pipeline_descriptor_set_requirements}); uint32_t to_alloc = needed - ctx->descriptor_sets.size(); uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE; uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE; while (to_alloc > 0) { const uint32_t alloc_count = std::min(pool_remaining, to_alloc); to_alloc -= alloc_count; pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE; if (pool_idx >= ctx->descriptor_pools.size()) { vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE); vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size); ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info)); } std::vector layouts(alloc_count); for (uint32_t i = 0; i < alloc_count; i++) { layouts[i] = device->dsl; } vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data()); std::vector sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info); ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end()); pool_idx++; } } static vk_command_buffer* ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) { VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()"); vk::CommandBufferAllocateInfo command_buffer_alloc_info( p.pool, vk::CommandBufferLevel::ePrimary, 1); const std::vector cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info); p.cmd_buffers.push_back({ cmd_buffers.front(), 0, true }); return &p.cmd_buffers[p.cmd_buffers.size()-1]; } static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) { if (ctx->seqs.empty()) { if (fence) { std::lock_guard guard(queue_mutex); ctx->p->q->queue.submit({}, fence); } return; } VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")"); std::vector> tl_wait_vals; std::vector> tl_signal_vals; std::vector> tl_wait_semaphores; std::vector> tl_signal_semaphores; std::vector tl_submit_infos; std::vector submit_infos; int idx = -1; std::vector> stage_flags; size_t reserve = 0; for (const auto& sequence : ctx->seqs) { reserve += sequence.size(); } // Pre-reserve vectors to prevent reallocation, which invalidates pointers tl_wait_semaphores.reserve(reserve); tl_wait_vals.reserve(reserve); tl_signal_semaphores.reserve(reserve); tl_signal_vals.reserve(reserve); tl_submit_infos.reserve(reserve); submit_infos.reserve(reserve); stage_flags.reserve(reserve); for (const auto& sequence : ctx->seqs) { for (const auto& submission : sequence) { stage_flags.push_back({}); idx++; tl_wait_vals.push_back({}); tl_wait_semaphores.push_back({}); tl_signal_vals.push_back({}); tl_signal_semaphores.push_back({}); for (size_t i = 0; i < submission.wait_semaphores.size(); i++) { stage_flags[idx].push_back(ctx->p->q->stage_flags); tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value); tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s); } for (size_t i = 0; i < submission.signal_semaphores.size(); i++) { tl_signal_vals[idx].push_back(submission.signal_semaphores[i].value); tl_signal_semaphores[idx].push_back(submission.signal_semaphores[i].s); } tl_submit_infos.push_back({ (uint32_t) submission.wait_semaphores.size(), tl_wait_vals[idx].data(), (uint32_t) submission.signal_semaphores.size(), tl_signal_vals[idx].data(), }); tl_submit_infos[idx].sType = vk::StructureType::eTimelineSemaphoreSubmitInfo; tl_submit_infos[idx].pNext = nullptr; vk::SubmitInfo si{ (uint32_t) submission.wait_semaphores.size(), tl_wait_semaphores[idx].data(), stage_flags[idx].data(), 1, &submission.buffer->buf, (uint32_t) submission.signal_semaphores.size(), tl_signal_semaphores[idx].data(), }; si.setPNext(&tl_submit_infos[idx]); submit_infos.push_back(si); } } std::lock_guard guard(queue_mutex); ctx->p->q->queue.submit(submit_infos, fence); ctx->seqs.clear(); } static uint32_t ggml_vk_find_queue_family_index(std::vector& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) { VK_LOG_DEBUG("ggml_vk_find_queue_family_index()"); const uint32_t qfsize = queue_family_props.size(); // Try with avoid preferences first for (uint32_t i = 0; i < qfsize; i++) { if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required && !(queue_family_props[i].queueFlags & avoid)) { return i; } } // Fall back to only required for (size_t i = 0; i < qfsize; i++) { if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required) { return i; } } // Fall back to reusing compute queue for (size_t i = 0; i < qfsize; i++) { if (queue_family_props[i].queueCount >= min_num_queues && queue_family_props[i].queueFlags & required) { return i; } } // Fall back to ignoring min_num_queries for (size_t i = 0; i < qfsize; i++) { if (queue_family_props[i].queueFlags & required) { return i; } } // All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations. // Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional. if (compute_index >= 0) { return compute_index; } std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl; for(auto &q_family : queue_family_props) { std::cerr << "Queue number: " + std::to_string(q_family.queueCount) << " flags: " + to_string(q_family.queueFlags) << std::endl; } abort(); } static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags, bool transfer_only) { VK_LOG_DEBUG("ggml_vk_create_queue()"); std::lock_guard guard(device->mutex); q.queue_family_index = queue_family_index; q.transfer_only = transfer_only; q.cmd_pool.init(device, &q); q.queue = device->device.getQueue(queue_family_index, queue_index); q.stage_flags = stage_flags; } static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) { vk_context result = std::make_shared(); VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")"); ctx->gc.contexts.emplace_back(result); result->p = &p; return result; } static vk_context ggml_vk_create_temporary_context(vk_command_pool& p) { vk_context result = std::make_shared(); VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")"); result->p = &p; return result; } static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) { VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()"); vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 }; vk::SemaphoreCreateInfo ci{}; ci.setPNext(&tci); vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci); ctx->gc.semaphores.push_back({ semaphore, 0 }); return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1]; } static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) { VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()"); if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) { vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 }; vk::SemaphoreCreateInfo ci{}; ci.setPNext(&tci); vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci); ctx->gc.tl_semaphores.push_back({ semaphore, 0 }); } return &ctx->gc.tl_semaphores[ctx->semaphore_idx++]; } static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) { if (ctx->event_idx >= ctx->gc.events.size()) { ctx->gc.events.push_back(ctx->device->device.createEvent({})); } return ctx->gc.events[ctx->event_idx++]; } static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) { VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()"); // Requires command buffers to be done device->device.resetCommandPool(p.pool); // Don't clear the command buffers and mark them as not in use. // This allows us to reuse them for (auto& cmd_buffer : p.cmd_buffers) { cmd_buffer.in_use = false; } } static void ggml_vk_queue_command_pools_cleanup(vk_device& device) { VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()"); // Arbitrary frequency to cleanup/reuse command buffers static constexpr uint32_t cleanup_frequency = 10; if (device->compute_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) { ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool); } if (device->transfer_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) { ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool); } } static std::vector ggml_vk_find_memory_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) { std::vector indices; for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) { vk::MemoryType memory_type = mem_props->memoryTypes[i]; if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) && (flags & memory_type.propertyFlags) == flags && mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) { indices.push_back(i); } } return indices; } static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list & req_flags_list, void *import_ptr = nullptr) { VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags_list.begin()[0]) << ", " << to_string(req_flags_list.begin()[req_flags_list.size()-1]) << ")"); if (size > device->max_buffer_size) { throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device buffer size limit"); } vk_buffer buf = std::make_shared(); if (size == 0) { buf->size = 0; return buf; } vk::BufferUsageFlags usage_flags = vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst; vk::MemoryAllocateFlags mem_flags {}; if (device->buffer_device_address) { usage_flags |= vk::BufferUsageFlagBits::eShaderDeviceAddress; mem_flags |= vk::MemoryAllocateFlagBits::eDeviceAddress; } vk::BufferCreateInfo buffer_create_info{ vk::BufferCreateFlags(), size, usage_flags, vk::SharingMode::eExclusive, 0, nullptr, }; vk::ExternalMemoryBufferCreateInfo external_memory_bci; if (import_ptr) { external_memory_bci.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT; buffer_create_info.setPNext(&external_memory_bci); } buf->buffer = device->device.createBuffer(buffer_create_info); vk::MemoryRequirements mem_req = device->device.getBufferMemoryRequirements(buf->buffer); vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties(); const vk::MemoryPriorityAllocateInfoEXT mem_priority_info { 1.0f }; vk::MemoryAllocateFlagsInfo mem_flags_info { mem_flags }; if (device->memory_priority) { mem_flags_info.setPNext(&mem_priority_info); } if (import_ptr) { vk::MemoryHostPointerPropertiesEXT host_pointer_props; try { host_pointer_props = device->device.getMemoryHostPointerPropertiesEXT(vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, import_ptr); } catch (vk::SystemError& e) { GGML_LOG_WARN("ggml_vulkan: Failed getMemoryHostPointerPropertiesEXT (%s)\n", e.what()); device->device.destroyBuffer(buf->buffer); return {}; } vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties(); uint32_t memory_type_idx; vk::MemoryPropertyFlags property_flags = *req_flags_list.begin(); for (memory_type_idx = 0; memory_type_idx < 32; ++memory_type_idx) { if (!(host_pointer_props.memoryTypeBits & (1u << memory_type_idx))) { continue; } if (!(mem_req.memoryTypeBits & (1u << memory_type_idx))) { continue; } vk::MemoryType memory_type = mem_props.memoryTypes[memory_type_idx]; // check for visible+coherent+cached. Other flags (e.g. devicelocal) are allowed if ((memory_type.propertyFlags & property_flags) == property_flags) { property_flags = memory_type.propertyFlags; break; } } if (memory_type_idx == 32) { GGML_LOG_WARN("ggml_vulkan: Memory type for host allocation not found\n"); device->device.destroyBuffer(buf->buffer); return {}; } buf->memory_property_flags = mem_props.memoryTypes[memory_type_idx].propertyFlags; try { vk::ImportMemoryHostPointerInfoEXT import_info; import_info.handleType = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT; import_info.pHostPointer = import_ptr; import_info.setPNext(&mem_flags_info); buf->device_memory = device->device.allocateMemory({ size, memory_type_idx, &import_info }); } catch (const vk::SystemError& e) { } } else { for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) { const auto & req_flags = *it; const std::vector memory_type_indices = ggml_vk_find_memory_properties(&mem_props, &mem_req, req_flags); if (memory_type_indices.empty()) { continue; } buf->memory_property_flags = req_flags; bool done = false; for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) { try { buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info }); done = true; break; } catch (const vk::SystemError& e) { // loop and retry // during last attempt throw the exception if (it + 1 == req_flags_list.end() && mtype_it + 1 == memory_type_indices.end()) { device->device.destroyBuffer(buf->buffer); throw e; } } } if (done) { break; } } } if (!buf->device_memory) { device->device.destroyBuffer(buf->buffer); throw vk::OutOfDeviceMemoryError("No suitable memory type found"); } buf->ptr = nullptr; if (import_ptr) { buf->ptr = import_ptr; } else { if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { buf->ptr = device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE); } } device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0); buf->device = device; buf->size = size; if (device->buffer_device_address) { const vk::BufferDeviceAddressInfo addressInfo(buf->buffer); buf->bda_addr = device->device.getBufferAddress(addressInfo); } device->memory_logger->log_allocation(buf, size); return buf; } static vk_buffer ggml_vk_create_buffer_check(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) { try { return ggml_vk_create_buffer(device, size, {req_flags, fallback_flags}); } catch (const vk::SystemError& e) { std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl; std::cerr << "ggml_vulkan: " << e.what() << std::endl; throw e; } } static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) { vk_buffer buf; try { if (device->prefer_host_memory) { buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal}); } else if (device->uma) { // Fall back to host memory type buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); } else if (device->disable_host_visible_vidmem) { if (device->allow_sysmem_fallback) { buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); } else { buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal}); } } else { // use rebar if available, otherwise fallback to device only visible memory if (device->allow_sysmem_fallback) { buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); } else { buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal}); } } } catch (const vk::SystemError& e) { std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl; std::cerr << "ggml_vulkan: " << e.what() << std::endl; throw e; } return buf; } static void ggml_vk_destroy_buffer(vk_buffer& buf) { if (buf == nullptr) { return; } if (buf->device != nullptr) { buf->device->memory_logger->log_deallocation(buf); } buf.reset(); } static vk_subbuffer ggml_vk_subbuffer(const ggml_backend_vk_context* ctx, const vk_buffer& buf, size_t offset = 0) { return { buf, offset, ggml_vk_get_max_buffer_range(ctx, buf, offset) }; } static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx) { VK_LOG_DEBUG("ggml_vk_sync_buffers()"); const bool transfer_queue = subctx->p->q->transfer_only; if (ctx) { ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false; } subctx->s->buffer->buf.pipelineBarrier( subctx->p->q->stage_flags, subctx->p->q->stage_flags, {}, { { { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) }, { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) } } }, {}, {} ); } static void ggml_vk_reset_event(vk_context& ctx, vk::Event& event) { VK_LOG_DEBUG("ggml_vk_set_event()"); ctx->s->buffer->buf.resetEvent( event, ctx->p->q->stage_flags ); } static void ggml_vk_set_event(vk_context& ctx, vk::Event& event) { VK_LOG_DEBUG("ggml_vk_set_event()"); ctx->s->buffer->buf.setEvent( event, ctx->p->q->stage_flags ); } static void ggml_vk_wait_events(vk_context& ctx, std::vector&& events) { VK_LOG_DEBUG("ggml_vk_wait_events()"); if (events.empty()) { return; } ctx->s->buffer->buf.waitEvents( events, ctx->p->q->stage_flags, ctx->p->q->stage_flags, {}, {}, {} ); } struct vk_fa_tuning_params { FaCodePath path; uint32_t workgroup_size; uint32_t subgroup_size; uint32_t block_rows; uint32_t block_cols; uint32_t d_split; uint32_t row_split; bool shmem_staging; bool disable_subgroups; uint32_t limit_occupancy_shmem; void print() const { std::cerr << "path=" << path << " workgroup_size=" << workgroup_size << " subgroup_size=" << subgroup_size << " block_rows=" << block_rows << " block_cols=" << block_cols << " d_split=" << d_split << " row_split=" << row_split << " shmem_staging=" << shmem_staging << " disable_subgroups=" << disable_subgroups << " limit_occupancy_shmem=" << limit_occupancy_shmem << std::endl; } }; static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type kv_type); static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc); static vk_fa_tuning_params get_fa_tuning_params_scalar(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type kv_type, bool f32acc) { vk_fa_tuning_params result{}; result.path = FA_SCALAR; if (device->vendor_id == VK_VENDOR_ID_INTEL) { // Disable subgroup use due to performance issues when enforcing subgroup sizes result.subgroup_size = 32; result.disable_subgroups = true; } else if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN) { result.subgroup_size = n_rows < 4 ? 32 : device->subgroup_size; } else { result.subgroup_size = device->subgroup_size; } // Row split splits the workgroup so that synchronization only has to happen within subgroups, which avoids barriers uint32_t row_split_max_hsk = 64; if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN && !device->uma) { row_split_max_hsk = n_rows <= 8 ? 64 : 128; } result.row_split = (n_rows < 4 || hsk <= row_split_max_hsk) ? 1 : 4; if (result.subgroup_size > 32 && (n_rows < 4 || hsk < (result.row_split == 1 ? 128 : 64))) { result.workgroup_size = result.subgroup_size * 2; } else { result.workgroup_size = result.subgroup_size * 4; } const uint32_t D = hsk | hsv; const bool reduce_block_rows = D & 8 || n_kv < 1024 || device->vendor_id == VK_VENDOR_ID_INTEL; if (n_rows == 1) { result.block_rows = 1; result.block_cols = 64; } else { // row_split 1 means higher register use per row, so block size has to be adjusted if (result.row_split == 1) { result.block_rows = n_rows == 2 ? 2 : ((n_rows <= 4 || reduce_block_rows) ? 4 : 8); } else { result.block_rows = n_rows <= 4 ? 4 : ((n_rows <= 8 || reduce_block_rows) ? 8 : 16); } result.block_cols = (D & 8) ? 64 : 32; } const uint32_t D_lsb = D ^ (D & (D-1)); // extract lowest set bit result.d_split = std::min(std::min(result.subgroup_size, 8u), D_lsb / 4); result.shmem_staging = (device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256 && hsv < 256) ? 1 : 0; if (!reduce_block_rows && !ggml_vk_flash_attn_scalar_shmem_support(device, result, hsk, hsv, f32acc, kv_type)) { result.block_rows /= 2; } // On AMD RDNA, for small head sizes and big batch size the shader uses few registers, so too many subgroups get scheduled // at once and end up thrashing the cache. Fix this by setting a large (unused) shmem buffer that reduces occupancy. // This targets an occupancy of 4 subgroups per SIMD. if (device->vendor_id == VK_VENDOR_ID_AMD && device->properties.limits.maxComputeSharedMemorySize == 65536) { if (device->architecture != AMD_GCN && n_rows >= 64 && hsk <= 128) { // 30kb target for hsk > 64, 26kb for <= 64 due to smaller workgroup size // Values are guessed, tested on RDNA2 result.limit_occupancy_shmem = (hsk <= 64 ? 26 : 30) * 1024 / 4 / 4; } else if (device->architecture == AMD_GCN && n_rows <= 8 && hsk >= 256) { // Same thing for GCN, with an occupancy target of 2 subgroups per SIMD. // Here low-batch FA with large head size is affected. // n_rows < 4 switch because workgroup size switches from 128 to 256 there. result.limit_occupancy_shmem = (n_rows < 4 ? 14 : 26) * 1024 / 4 / 4; } } return result; } static vk_fa_tuning_params get_fa_tuning_params_coopmat1(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type kv_type, bool f32acc) { GGML_UNUSED(n_rows); GGML_UNUSED(n_kv); GGML_UNUSED(kv_type); GGML_UNUSED(f32acc); vk_fa_tuning_params result{}; result.path = FA_COOPMAT1; const uint32_t D = hsk | hsv; const uint32_t coopmat_block_rows = 16; const uint32_t coopmat_block_cols = 16; const uint32_t num_subgroups = 4; result.block_rows = coopmat_block_rows; result.block_cols = coopmat_block_cols * num_subgroups; result.row_split = num_subgroups; result.subgroup_size = device->subgroup_size; result.workgroup_size = num_subgroups * result.subgroup_size; const uint32_t D_lsb = D ^ (D & (D-1)); // extract lowest set bit result.d_split = std::min(std::min(result.subgroup_size, 8u), D_lsb / 4); result.shmem_staging = (device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256 && hsv < 256) ? 1 : 0; return result; } static vk_fa_tuning_params get_fa_tuning_params_coopmat2(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc) { GGML_UNUSED(n_kv); GGML_UNUSED(f32acc); vk_fa_tuning_params result{}; result.path = FA_COOPMAT2; const uint32_t D = hsk | hsv; const bool small_rows = n_rows < 32; if (small_rows) { result.block_rows = 32; result.block_cols = 32; } else if (ggml_is_quantized(k_type) || ggml_is_quantized(v_type) || hsk >= 256 || hsv >= 256) { result.block_rows = (hsk >= 512 || hsv >= 512) ? 32 : 64; result.block_cols = 32; } else { result.block_rows = 64; result.block_cols = 64; } result.subgroup_size = device->subgroup_size; result.workgroup_size = (small_rows && (D % 32) == 0) ? 256 : 128; return result; } static vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc) { // Mixed K/V is only implemented on the coopmat2 (flash_attn_cm2) path; never use scalar/cm1. if (k_type != v_type) { GGML_ASSERT(device->coopmat2); return get_fa_tuning_params_coopmat2(device, hsk, hsv, n_rows, n_kv, k_type, v_type, f32acc); } FaCodePath path = device->coopmat2 ? FA_COOPMAT2 : device->coopmat1_fa_support ? FA_COOPMAT1 : FA_SCALAR; if (path == FA_COOPMAT1 && device->architecture == vk_device_architecture::NVIDIA_TURING) { // Nvidia compiler bug, see https://github.com/ggml-org/llama.cpp/pull/19075#issuecomment-3820716090 path = FA_SCALAR; } if (path == FA_COOPMAT1) { bool shape_ok = (f32acc && device->coopmat_support_16x16x16_f32acc) || (!f32acc && device->coopmat_support_16x16x16_f16acc); const vk_fa_tuning_params params = get_fa_tuning_params_coopmat1(device, hsk, hsv, n_rows, n_kv, k_type, f32acc); bool shmem_ok = ggml_vk_flash_attn_coopmat_shmem_support(device, params, hsk, hsv, f32acc); if (!shape_ok || !shmem_ok) { path = FA_SCALAR; } } // scalar is faster than coopmat when N==1 if (n_rows == 1 && (path == FA_COOPMAT1 || path == FA_COOPMAT2)) { path = FA_SCALAR; } // Q1_0 K/V is only implemented on coopmat2 (flash_attn_cm2); there is no scalar FA shader for it. if ((k_type == GGML_TYPE_Q1_0 || v_type == GGML_TYPE_Q1_0) && device->coopmat2) { path = FA_COOPMAT2; } switch (path) { case FA_SCALAR: return get_fa_tuning_params_scalar(device, hsk, hsv, n_rows, n_kv, k_type, f32acc); case FA_COOPMAT1: return get_fa_tuning_params_coopmat1(device, hsk, hsv, n_rows, n_kv, k_type, f32acc); case FA_COOPMAT2: return get_fa_tuning_params_coopmat2(device, hsk, hsv, n_rows, n_kv, k_type, v_type, f32acc); default: throw std::runtime_error("unsupported FaCodePath"); } } static vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc, bool use_mask, bool use_mask_opt, bool use_logit_softcap, ggml_type k_type, ggml_type v_type) { const bool old_amd_windows = device->vendor_id == VK_VENDOR_ID_AMD && device->driver_id == vk::DriverId::eAmdProprietary && (device->architecture == AMD_GCN || device->architecture == AMD_RDNA1 || device->architecture == AMD_RDNA2); uint32_t flags = (use_mask_opt ? 1 : 0) | (use_mask ? 2 : 0) | (use_logit_softcap ? 4 : 0) | (old_amd_windows ? 8 : 0); const uint32_t subgroup_size = params.disable_subgroups ? 0 : params.subgroup_size; return vk_fa_pipeline_state{hsk, hsv, params.block_rows, params.block_cols, params.d_split, params.row_split, params.shmem_staging, params.path, params.workgroup_size, subgroup_size, aligned, f32acc, flags, params.limit_occupancy_shmem, k_type, v_type}; } static std::vector get_fa_spec_constants(const vk_fa_pipeline_state& state) { const auto fa_block_bytes = [](ggml_type t) -> uint32_t { // decodeBufF32 uses a block of vec4s for a better memory access pattern. return t == GGML_TYPE_F32 ? 16u : (uint32_t) ggml_type_size(t); }; return { /* 0 WorkGroupSize */ state.workgroup_size, /* 1 Br */ state.Br, /* 2 Bc */ state.Bc, /* 3 HSK */ state.HSK, /* 4 HSV */ state.HSV, /* 5 Clamp */ static_cast(!state.aligned), /* 6 D_split */ state.D_split, /* 7 row_split */ state.row_split, /* 8 SubGroupSize */ state.subgroup_size, /* 9 SHMEM_STAGING */ state.shmem_staging ? 1u : 0u, /*10 Flags */ state.flags, /*11 LIMIT_OCCUPANCY_SHMEM */ state.limit_occupancy_shmem, /*12 FaTypeK */ static_cast(state.k_type), /*13 FaTypeV */ static_cast(state.v_type), /*14 FaBlockBytesK */ fa_block_bytes(state.k_type), /*15 FaBlockBytesV */ fa_block_bytes(state.v_type), }; } static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector& warptile, bool mul_mat_id, ggml_type src0_type) { uint32_t lut_size = 0; switch (src0_type) { case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ1_M: lut_size = 2*2048 + 4*2048; break; case GGML_TYPE_IQ2_XXS: lut_size = 8*256; break; case GGML_TYPE_IQ2_XS: lut_size = 8*512; break; case GGML_TYPE_IQ2_S: lut_size = 8*1024; break; case GGML_TYPE_IQ3_XXS: lut_size = 4*256; break; case GGML_TYPE_IQ3_S: lut_size = 4*512; break; case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_MXFP4: lut_size = 4*16; break; case GGML_TYPE_NVFP4: // Same kvalues budget as MXFP4 plus ue4m3_fp32_lut[128] (types.glsl, DATA_A_NVFP4). lut_size = 4*16 + 128u * (uint32_t)sizeof(float); break; default: break; } // Needs to be kept up to date on shader changes const uint32_t bank_conflict_offset = device->coopmat_support ? 8 : 1; const uint32_t type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float); const uint32_t warps = warptile[0] / warptile[10]; const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size; const uint32_t mmid_row_ids = mul_mat_id ? (warptile[2] * 2 * sizeof(uint16_t)) : 0; const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0; const uint32_t ballots_sh = mul_mat_id ? (warps * 4 * sizeof(uint32_t)) : 0; const uint32_t total_size = load_bufs + mmid_row_ids + coopmat_stage + lut_size + ballots_sh; const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; VK_LOG_DEBUG("ggml_vk_matmul_shmem_support(warptile=(" << warptile[0] << "," << warptile[1] << "," << warptile[2] << "), " "mul_mat_id=" << mul_mat_id << ", src0_type=" << ggml_type_name(src0_type) << ", supported=" << supported); return supported; } struct GpuPipelineConfig { // GPU architecture identifier. // Example: vk_device_architecture::AMD_GCN vk_device_architecture arch; // Mapping of pipeline names to their specific subgroup sizes. // Example: {"soft_max_f32", 64} std::unordered_map pipelines; // Default subgroup size for this GPU. // Defaults to 0 if not explicitly provided. uint32_t default_subgroup_size = 0; }; // Pipeline configuration for RDNA1 GPUs. static const std::unordered_map rdna1_pipelines = { {"soft_max", 64}, {"im2col", 64}, {"argmax", 64}, {"mul_mat_vec", 64}, {"mul_mat_vec_f16", 32}, {"mul_mat_vec_f32_f16", 32} }; // Pipeline configuration for RDNA2 GPUs. static const std::unordered_map rdna2_pipelines = { {"soft_max", 64}, {"im2col", 64}, }; static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32; // Define configurations for different GPUs. static std::vector gpu_pipeline_configs = { { vk_device_architecture::AMD_RDNA1, { rdna1_pipelines, }, RDNA_DEFAULT_SUBGROUP_SIZE }, { vk_device_architecture::AMD_RDNA2, { rdna2_pipelines, }, RDNA_DEFAULT_SUBGROUP_SIZE }, }; static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch) { for (const auto &config : gpu_pipeline_configs) { if (config.arch == arch) { auto pipIt = config.pipelines.find(pipeline_name); if (pipIt != config.pipelines.end()) { return pipIt->second; } std::vector> sorted_pipelines(config.pipelines.begin(), config.pipelines.end()); std::sort(sorted_pipelines.begin(), sorted_pipelines.end(), [](const auto &a, const auto &b) { return a.first.size() > b.first.size(); }); for (const auto &entry : sorted_pipelines) { if (pipeline_name.find(entry.first) != std::string::npos) { return entry.second; } } return config.default_subgroup_size; } } return 0; // If no matching configuration is found } static void ggml_vk_load_shaders(vk_device& device) { VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); std::lock_guard guard(device->mutex); // some shaders have a minimum subgroup size const uint32_t subgroup_size_8 = std::max(device->subgroup_size, 8u); const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u); const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u); const uint32_t mul_mat_subgroup_size = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size; const uint32_t mul_mat_subgroup_size_8 = std::max(mul_mat_subgroup_size, 8u); const uint32_t mul_mat_subgroup_size_16 = std::max(mul_mat_subgroup_size, 16u); const uint32_t mul_mat_subgroup_size_32 = std::max(mul_mat_subgroup_size, 32u); const bool subgroup_min_size_16 = (!device->subgroup_size_control && device->subgroup_size >= 16) || (device->subgroup_size_control && device->subgroup_max_size >= 16); // mulmat std::vector l_warptile, m_warptile, s_warptile, l_warptile_id, m_warptile_id, s_warptile_id, l_warptile_mmq, m_warptile_mmq, s_warptile_mmq, l_warptile_mmq_int, m_warptile_mmq_int, s_warptile_mmq_int, l_warptile_mmq_int_k, m_warptile_mmq_int_k, s_warptile_mmq_int_k, l_warptile_mmq_k, m_warptile_mmq_k, s_warptile_mmq_k, l_warptile_mmqid, m_warptile_mmqid, s_warptile_mmqid, l_warptile_mmqid_int, m_warptile_mmqid_int, s_warptile_mmqid_int, l_warptile_mmqid_int_k, m_warptile_mmqid_int_k, s_warptile_mmqid_int_k; std::array l_wg_denoms, m_wg_denoms, s_wg_denoms, l_mmq_wg_denoms, m_mmq_wg_denoms, s_mmq_wg_denoms, l_mmq_wg_denoms_k, m_mmq_wg_denoms_k, s_mmq_wg_denoms_k, l_mmqid_wg_denoms, m_mmqid_wg_denoms, s_mmqid_wg_denoms; uint32_t l_align, m_align, s_align; if (device->coopmat2) { // spec constants and tile sizes for non-quant matmul/matmul_id l_warptile = { 256, 128, 256, 64, 1 }; m_warptile = { 256, 128, 128, 64, 0 }; s_warptile = { 128, 64, 64, 64, 0 }; l_wg_denoms = {128, 256, 1 }; m_wg_denoms = {128, 128, 1 }; s_wg_denoms = { 64, 64, 1 }; // spec constants and tile sizes for quant matmul (non-Qi_K) l_warptile_mmq = { 256, 128, 256, 64, 1 }; m_warptile_mmq = { 256, 128, 128, 64, 1 }; s_warptile_mmq = { 256, 32, 64, 128, 0 }; l_mmq_wg_denoms = { 128, 256, 1 }; m_mmq_wg_denoms = { 128, 128, 1 }; s_mmq_wg_denoms = { 32, 64, 1 }; // spec constants and tile sizes for quant matmul (Qi_K) l_warptile_mmq_k = { 256, 128, 256, 64, 1 }; m_warptile_mmq_k = { 256, 128, 128, 64, 1 }; s_warptile_mmq_k = { 256, 32, 64, 128, 0 }; l_mmq_wg_denoms_k = { 128, 256, 1 }; m_mmq_wg_denoms_k = { 128, 128, 1 }; s_mmq_wg_denoms_k = { 32, 64, 1 }; // spec constants and tile sizes for quant matmul_id l_warptile_mmqid = { 256, 128, 128, 32, 1, device->subgroup_size }; m_warptile_mmqid = { 256, 128, 64, 32, 0, device->subgroup_size }; s_warptile_mmqid = { 256, 128, 64, 32, 0, device->subgroup_size }; l_mmqid_wg_denoms = { 128, 128, 1 }; m_mmqid_wg_denoms = { 128, 64, 1 }; s_mmqid_wg_denoms = { 128, 64, 1 }; l_align = 128; m_align = 64; s_align = 32; } else { // Matrix cores require different warp group sizes const uint32_t tm_l = device->coopmat_support ? device->coopmat_m : 4; const uint32_t tm_m = device->coopmat_support ? device->coopmat_m : 4; const uint32_t tm_s = device->coopmat_support ? device->coopmat_m : 2; const uint32_t tn_l = device->coopmat_support ? device->coopmat_n : 4; const uint32_t tn_m = device->coopmat_support ? device->coopmat_n : 2; const uint32_t tn_s = device->coopmat_support ? device->coopmat_n : 2; const uint32_t tk_l = device->coopmat_support ? device->coopmat_k : 1; const uint32_t tk_m = device->coopmat_support ? device->coopmat_k : 1; const uint32_t tk_s = device->coopmat_support ? device->coopmat_k : 1; const uint32_t s_warptile_wm = device->subgroup_size == 8 ? 8 : 32; l_warptile = { 128, 128, 128, 16, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 }; m_warptile = { 128, 64, 64, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; s_warptile = { subgroup_size_32, 32, 32, 16, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 }; l_warptile_mmq = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 }; m_warptile_mmq = { 128, 64, 64, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; s_warptile_mmq = { subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 }; // Integer MMQ has a smaller shared memory profile, but heavier register use l_warptile_mmq_int = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 2, 4, 4, 1, subgroup_size_8 }; m_warptile_mmq_int = { 128, 64, 64, 32, subgroup_size_8, 32, 2, 2, 2, 1, subgroup_size_8 }; s_warptile_mmq_int = { subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, 2, 1, 1, subgroup_size_8 }; // K-quants use even more registers, mitigate by setting WMITER to 1 l_warptile_mmq_int_k = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 1, 4, 4, 1, subgroup_size_8 }; m_warptile_mmq_int_k = { 128, 64, 64, 32, subgroup_size_8, 32, 1, 2, 2, 1, subgroup_size_8 }; s_warptile_mmq_int_k = { subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 1, 2, 1, 1, subgroup_size_8 }; l_warptile_id = { 128, 128, 128, 16, mul_mat_subgroup_size_16 * 2, 64, 2, tm_l, tn_l, tk_l, mul_mat_subgroup_size_16 }; m_warptile_id = { 128, 64, 64, 16, mul_mat_subgroup_size_16, 32, 2, tm_m, tn_m, tk_m, mul_mat_subgroup_size_16 }; s_warptile_id = { mul_mat_subgroup_size_16, 32, 32, 16, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, mul_mat_subgroup_size_16 }; l_warptile_mmqid = { 128, 128, 128, 32, mul_mat_subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, mul_mat_subgroup_size_8 }; m_warptile_mmqid = { 128, 64, 64, 32, mul_mat_subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, mul_mat_subgroup_size_8 }; s_warptile_mmqid = { mul_mat_subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, mul_mat_subgroup_size_8 }; l_warptile_mmqid_int = { 128, 128, 128, 32, mul_mat_subgroup_size_8 * 2, 64, 2, 4, 4, 1, mul_mat_subgroup_size_8 }; m_warptile_mmqid_int = { 128, 64, 64, 32, mul_mat_subgroup_size_8, 32, 2, 2, 2, 1, mul_mat_subgroup_size_8 }; s_warptile_mmqid_int = { mul_mat_subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, 2, 1, 1, mul_mat_subgroup_size_8 }; l_warptile_mmqid_int_k = { 128, 128, 128, 32, mul_mat_subgroup_size_16 * 2, 64, 1, 4, 4, 1, mul_mat_subgroup_size_16 }; m_warptile_mmqid_int_k = { 128, 64, 64, 32, mul_mat_subgroup_size_16, 32, 1, 2, 2, 1, mul_mat_subgroup_size_16 }; s_warptile_mmqid_int_k = { mul_mat_subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 1, 2, 1, 1, mul_mat_subgroup_size_16 }; // chip specific tuning if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) { m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 }; m_warptile_mmqid = m_warptile_mmqid_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 }; } else if (device->vendor_id == VK_VENDOR_ID_AMD && device->coopmat_support && device->driver_id != vk::DriverId::eAmdProprietary) { // This is intentionally using tx_m values, slight performance increase l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 }; } else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) { // Xe2/Xe3 with coopmat enabled - warptile performance tuning l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; } l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 }; l_align = 128; m_align = 64; s_align = 32; for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) { ggml_type t = (ggml_type)i; // Disable medium and large matrix multiplication if not enough shared memory is available // Check mmq warptiles as the largest configuration // Throw an error if not enough for any matrix multiplication is available if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, false, t)) { std::cerr << "ggml_vulkan: Error: Shared memory size too small for matrix multiplication." << std::endl; throw std::runtime_error("Shared memory size too small for matrix multiplication."); } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, false, t)) { device->mul_mat_m[i] = false; device->mul_mat_l[i] = false; } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, false, t)) { device->mul_mat_l[i] = false; } // Disable mul_mat_id if not enough shared memory is available if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmqid, true, t)) { device->mul_mat_id_s[i] = false; device->mul_mat_id_m[i] = false; device->mul_mat_id_l[i] = false; } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmqid, true, t)) { device->mul_mat_id_m[i] = false; device->mul_mat_id_l[i] = false; } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmqid, true, t)) { device->mul_mat_id_l[i] = false; } } } if (!device->pipeline_matmul_f32) { device->pipeline_matmul_f32 = std::make_shared(); } if (!device->pipeline_matmul_f32_f16) { device->pipeline_matmul_f32_f16 = std::make_shared(); } if (!device->pipeline_matmul_id_f32) { device->pipeline_matmul_id_f32 = std::make_shared(); } if (!device->pipeline_matmul_bf16) { device->pipeline_matmul_bf16 = std::make_shared(); } if (!device->pipeline_matmul_id_bf16) { device->pipeline_matmul_id_bf16 = std::make_shared(); } std::vector> compiles; auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& base_pipeline, const char *name, size_t spv_size, const void* spv_data, const char *entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) { if (!require_full_subgroups && required_subgroup_size == 0) { required_subgroup_size = get_subgroup_size(name, device->architecture); } vk_pipeline *ptr = &base_pipeline; int num_pipelines = 1; #if defined(VK_EXT_shader_64bit_indexing) if (device->shader_64b_indexing) { num_pipelines = 2; } #endif for (int i = 0; i < num_pipelines; ++i, ptr = &(*ptr)->next) { vk_pipeline &pipeline = *ptr; if (!pipeline) { pipeline = std::make_shared(); } if (!pipeline->initialized) { pipeline->name = name; pipeline->parameter_count = parameter_count; pipeline->push_constant_size = push_constant_size; pipeline->wg_denoms = wg_denoms; pipeline->align = align; pipeline->initialized = true; #if defined(VK_EXT_shader_64bit_indexing) pipeline->is_64b_indexing = (i == 1); #endif } if (!pipeline->needed || pipeline->compiled) { continue; } // TODO: We're no longer benefitting from the async compiles (shaders are // compiled individually, as needed) and this complexity can be removed. { // wait until fewer than N compiles are in progress uint32_t N = std::max(1u, std::thread::hardware_concurrency()); std::unique_lock guard(compile_count_mutex); while (compile_count >= N) { compile_count_cond.wait(guard); } compile_count++; } compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint, parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size)); } }; auto const &ggml_vk_create_pipeline2 = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const char *entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) { return ggml_vk_create_pipeline(device, pipeline, name.c_str(), spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness, require_full_subgroups, required_subgroup_size); }; #define CREATE_FA(TYPE, NAMELC, FAPATH, SUFFIX) \ for (auto &fa : device->pipeline_flash_attn_f32_f16[TYPE]) { \ FaCodePath path = fa.first.path; \ uint32_t Br = fa.first.Br; \ uint32_t Bc = fa.first.Bc; \ bool aligned = fa.first.aligned; \ bool f32acc = fa.first.f32acc; \ uint32_t fa_sgs = fa.first.subgroup_size; \ bool fa_ds = fa.first.subgroup_size == 0; \ if (path == FAPATH) { \ if (aligned) { \ if (f32acc) { \ ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, get_fa_spec_constants(fa.first), Bc, true, (!fa_ds && (FAPATH!=FA_COOPMAT2)), ((!fa_ds && (FAPATH!=FA_COOPMAT2)) ? fa_sgs : 0)); \ } else { \ ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, get_fa_spec_constants(fa.first), Bc, true, (!fa_ds && (FAPATH!=FA_COOPMAT2)), ((!fa_ds && (FAPATH!=FA_COOPMAT2)) ? fa_sgs : 0)); \ } \ } else { \ if (f32acc) { \ ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, get_fa_spec_constants(fa.first), 1, true, (!fa_ds && (FAPATH!=FA_COOPMAT2)), ((!fa_ds && (FAPATH!=FA_COOPMAT2)) ? fa_sgs : 0)); \ } else { \ ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, get_fa_spec_constants(fa.first), 1, true, (!fa_ds && (FAPATH!=FA_COOPMAT2)), ((!fa_ds && (FAPATH!=FA_COOPMAT2)) ? fa_sgs : 0)); \ } \ } \ } \ } if (device->fp16) { CREATE_FA(GGML_TYPE_F32, f32, FA_SCALAR, ) CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, ) #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) if (device->integer_dot_product && device->subgroup_clustered) { CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, _int8) CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_SCALAR, _int8) CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_SCALAR, _int8) CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_SCALAR, _int8) CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_SCALAR, _int8) CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_SCALAR, _int8) } else #endif { CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, ) CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_SCALAR, ) CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_SCALAR, ) CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_SCALAR, ) CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_SCALAR, ) CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_SCALAR, ) } } else { CREATE_FA(GGML_TYPE_F32, f32, FA_SCALAR, _fp32) CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, _fp32) #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) if (device->integer_dot_product && device->subgroup_clustered) { CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, _fp32_int8) CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_SCALAR, _fp32_int8) CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_SCALAR, _fp32_int8) CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_SCALAR, _fp32_int8) CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_SCALAR, _fp32_int8) CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_SCALAR, _fp32_int8) } else #endif { CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, _fp32) CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_SCALAR, _fp32) CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_SCALAR, _fp32) CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_SCALAR, _fp32) CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_SCALAR, _fp32) CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_SCALAR, _fp32) } } #if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) if (device->coopmat1_fa_support) { CREATE_FA(GGML_TYPE_F32, f32, FA_COOPMAT1, _cm1) CREATE_FA(GGML_TYPE_F16, f16, FA_COOPMAT1, _cm1) CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_COOPMAT1, _cm1) CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_COOPMAT1, _cm1) CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_COOPMAT1, _cm1) CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_COOPMAT1, _cm1) CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_COOPMAT1, _cm1) CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_COOPMAT1, _cm1) } #endif #if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) #define CREATE_FA_CM2_MIXED() \ for (int fa_k_ty = 0; fa_k_ty < (int)GGML_TYPE_COUNT; ++fa_k_ty) { \ for (auto &fa : device->pipeline_flash_attn_f32_f16[fa_k_ty]) { \ FaCodePath path = fa.first.path; \ uint32_t Br = fa.first.Br; \ uint32_t Bc = fa.first.Bc; \ bool aligned = fa.first.aligned; \ bool f32acc = fa.first.f32acc; \ if (path == FA_COOPMAT2) { \ if (aligned) { \ if (f32acc) { \ ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_mixed_aligned_f32acc_cm2", flash_attn_f32_f16_mixed_cm2_len, flash_attn_f32_f16_mixed_cm2_data, "main", 7, sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, get_fa_spec_constants(fa.first), Bc, true, false, 0); \ } else { \ ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_mixed_aligned_f16acc_cm2", flash_attn_f32_f16_mixed_f16acc_cm2_len, flash_attn_f32_f16_mixed_f16acc_cm2_data, "main", 7, sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, get_fa_spec_constants(fa.first), Bc, true, false, 0); \ } \ } else { \ if (f32acc) { \ ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_mixed_f32acc_cm2", flash_attn_f32_f16_mixed_cm2_len, flash_attn_f32_f16_mixed_cm2_data, "main", 7, sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, get_fa_spec_constants(fa.first), 1, true, false, 0); \ } else { \ ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_mixed_f16acc_cm2", flash_attn_f32_f16_mixed_f16acc_cm2_len, flash_attn_f32_f16_mixed_f16acc_cm2_data, "main", 7, sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, get_fa_spec_constants(fa.first), 1, true, false, 0); \ } \ } \ } \ } \ } if (device->coopmat2) { CREATE_FA_CM2_MIXED(); } #undef CREATE_FA_CM2_MIXED #endif #undef CREATE_FA const int mul_mat_id_param_count = 5; #if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) if (device->coopmat2) { // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, true); \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, true); \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, true); \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, true); \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, true); \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, true); \ // Create 2 variants, {f16,f32} accumulator #define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3) #if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) if (device->coopmat_bf16_support) { CREATE_MM(pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3) } #endif CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q1_0], matmul_q1_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0], matmul_q4_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1], matmul_q4_1_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0], matmul_q5_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_1], matmul_q5_1_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q8_0], matmul_q8_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q2_K], matmul_q2_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q3_K], matmul_q3_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_K], matmul_q4_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_K], matmul_q5_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K], matmul_q6_k_f16, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ1_S], matmul_iq1_s_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ1_M], matmul_iq1_m_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_XS], matmul_iq2_xs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ2_S], matmul_iq2_s_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ3_S], matmul_iq3_s_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_XS], matmul_iq4_xs_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL], matmul_iq4_nl_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_MXFP4], matmul_mxfp4_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_NVFP4], matmul_nvfp4_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) GGML_ASSERT(device->subgroup_ballot); CREATE_MM2(pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 5) #if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) if (device->coopmat_bf16_support) { CREATE_MM(pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 5) } #endif CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_subgroup_q1_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_subgroup_q5_1_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_subgroup_q8_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_subgroup_q2_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_subgroup_q3_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_subgroup_q4_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_subgroup_q5_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_subgroup_q6_k_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_subgroup_iq1_s_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_subgroup_iq1_m_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_subgroup_iq2_xxs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_subgroup_iq2_xs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_subgroup_iq2_s_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_subgroup_iq3_xxs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_subgroup_iq3_s_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_subgroup_iq4_xs_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_subgroup_iq4_nl_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_subgroup_mxfp4_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4], matmul_id_subgroup_nvfp4_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) #undef CREATE_MM #undef CREATE_MM2 } else #endif // defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) #if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) if (device->coopmat_support) { // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ if (device->mul_mat ## ID ## _l[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true); \ if (device->mul_mat ## ID ## _m[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true); \ if (device->mul_mat ## ID ## _s[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true); \ if (device->mul_mat ## ID ## _l[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true); \ if (device->mul_mat ## ID ## _m[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true); \ if (device->mul_mat ## ID ## _s[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true); \ // Create 2 variants, {f16,f32} accumulator #define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ if (device->coopmat_acc_f16_support) { \ CREATE_MM(TYPE, PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ } \ if (device->coopmat_acc_f32_support) { \ CREATE_MM(TYPE, PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ } \ CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); #if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) if (device->coopmat_bf16_support) { CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ) } #endif if (device->coopmat_acc_f16_support) { CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0], matmul_q1_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0], matmul_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1], matmul_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0], matmul_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1], matmul_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0], matmul_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K], matmul_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K], matmul_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K], matmul_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K], matmul_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K], matmul_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S], matmul_iq1_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M], matmul_iq1_m_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS], matmul_iq2_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S], matmul_iq2_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S], matmul_iq3_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS], matmul_iq4_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL], matmul_iq4_nl_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_MXFP4], matmul_mxfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_NVFP4], matmul_nvfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); } else { CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0].f32acc, matmul_q1_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S].f32acc, matmul_iq1_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M].f32acc, matmul_iq1_m_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f32acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f32acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_MXFP4].f32acc, matmul_mxfp4_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_NVFP4].f32acc, matmul_nvfp4_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); } GGML_ASSERT(device->subgroup_ballot); CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); #if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) if (device->coopmat_bf16_support) { CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); } #endif CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_subgroup_q1_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_subgroup_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_subgroup_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_subgroup_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_subgroup_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_subgroup_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_subgroup_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_subgroup_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_subgroup_iq1_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_subgroup_iq1_m_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_subgroup_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_subgroup_iq2_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_subgroup_iq2_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_subgroup_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_subgroup_iq3_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_subgroup_iq4_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_subgroup_iq4_nl_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_subgroup_mxfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4], matmul_id_subgroup_nvfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); #undef CREATE_MM2 #undef CREATE_MM } else #endif // defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) if (device->fp16) { // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ if (device->mul_mat ## ID ## _l[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _m[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _s[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _l[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _m[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _s[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ #define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ if (device->mul_mat ## ID ## _l[TYPE]) { \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->l, #NAMELC "_l", NAMELC ## _len, NAMELC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ } \ if (device->mul_mat ## ID ## _m[TYPE]) { \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->m, #NAMELC "_m", NAMELC ## _len, NAMELC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ } \ if (device->mul_mat ## ID ## _s[TYPE]) { \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME .f32acc->s, #NAMELC "_s", NAMELC ## _len, NAMELC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ } \ // Create 2 variants, {f16,f32} accumulator #define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ CREATE_MM(TYPE, PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ CREATE_MM(TYPE, PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0], matmul_q1_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0], matmul_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1], matmul_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0], matmul_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1], matmul_q5_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0], matmul_q8_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K], matmul_q2_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K], matmul_q3_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K], matmul_q4_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K], matmul_q5_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K], matmul_q6_k_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S], matmul_iq1_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M], matmul_iq1_m_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS], matmul_iq2_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS], matmul_iq2_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S], matmul_iq2_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS], matmul_iq3_xxs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S], matmul_iq3_s_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS], matmul_iq4_xs_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL], matmul_iq4_nl_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_MXFP4], matmul_mxfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_NVFP4], matmul_nvfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) if (device->integer_dot_product) { CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0], matmul_q4_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1], matmul_q4_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0], matmul_q5_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1], matmul_q5_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0], matmul_q8_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); CREATE_MMQ(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_MXFP4], matmul_mxfp4_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, , 0); CREATE_MMQ(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q2_K], matmul_q2_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); CREATE_MMQ(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q3_K], matmul_q3_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); CREATE_MMQ(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_K], matmul_q4_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); CREATE_MMQ(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_K], matmul_q5_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); CREATE_MMQ(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q6_K], matmul_q6_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, , 0); } #endif if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) { CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_subgroup_f16, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_subgroup_q1_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_subgroup_q5_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_subgroup_q8_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_subgroup_q2_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_subgroup_q3_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_subgroup_q4_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_subgroup_q5_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_subgroup_q6_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_subgroup_iq1_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_subgroup_iq1_m_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_subgroup_iq2_xxs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_subgroup_iq2_xs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_subgroup_iq2_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_subgroup_iq3_xxs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_subgroup_iq3_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_subgroup_iq4_xs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_subgroup_iq4_nl_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_subgroup_mxfp4_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4], matmul_id_subgroup_nvfp4_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) if (device->integer_dot_product) { CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_1], matmul_id_subgroup_q5_1_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q8_0], matmul_id_subgroup_q8_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MMQ(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_MXFP4], matmul_id_subgroup_mxfp4_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MMQ(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q2_K], matmul_id_subgroup_q2_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MMQ(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q3_K], matmul_id_subgroup_q3_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MMQ(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_K], matmul_id_subgroup_q4_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MMQ(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_K], matmul_id_subgroup_q5_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MMQ(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q6_K], matmul_id_subgroup_q6_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); } #endif } else { CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_q1_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S], matmul_id_iq1_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M], matmul_id_iq1_m_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS], matmul_id_iq2_xxs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS], matmul_id_iq2_xs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S], matmul_id_iq2_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS], matmul_id_iq3_xxs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S], matmul_id_iq3_s_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS], matmul_id_iq4_xs_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4], matmul_id_mxfp4_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4], matmul_id_nvfp4_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) if (device->integer_dot_product) { CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_0], matmul_id_q4_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_1], matmul_id_q4_1_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_0], matmul_id_q5_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_1], matmul_id_q5_1_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q8_0], matmul_id_q8_0_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MMQ(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_MXFP4], matmul_id_mxfp4_q8_1, mmq_wg_denoms, warptile_mmqid_int, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MMQ(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q2_K], matmul_id_q2_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MMQ(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q3_K], matmul_id_q3_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MMQ(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q4_K], matmul_id_q4_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MMQ(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q5_K], matmul_id_q5_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MMQ(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id_q8_1[GGML_TYPE_Q6_K], matmul_id_q6_k_q8_1, mmq_wg_denoms, warptile_mmqid_int_k, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); } #endif } #undef CREATE_MM2 #undef CREATE_MMQ #undef CREATE_MM } else { // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ if (device->mul_mat ## ID ## _l[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _m[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _s[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _l[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _m[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _s[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ #define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ if (device->mul_mat ## ID ## _l[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC "_l", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ if (device->mul_mat ## ID ## _m[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC "_m", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ if (device->mul_mat ## ID ## _s[TYPE]) \ ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC "_s", NAMELC ## _fp32_len, NAMELC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0].f32acc, matmul_q1_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_S].f32acc, matmul_iq1_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ1_M].f32acc, matmul_iq1_m_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f32acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f32acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_MXFP4].f32acc, matmul_mxfp4_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_NVFP4].f32acc, matmul_nvfp4_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) if (device->integer_dot_product) { CREATE_MMQ(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); CREATE_MMQ(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); CREATE_MMQ(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); CREATE_MMQ(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); CREATE_MMQ(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_q8_1, mmq_wg_denoms, warptile_mmq_int, vk_mat_mat_push_constants, 3, ); CREATE_MMQ(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); CREATE_MMQ(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); CREATE_MMQ(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); CREATE_MMQ(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); CREATE_MMQ(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_q8_1[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_q8_1, mmq_wg_denoms, warptile_mmq_int_k, vk_mat_mat_push_constants, 3, ); } #endif if (device->subgroup_ballot && device->subgroup_require_full_support && subgroup_min_size_16) { CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_subgroup_f32_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_subgroup_f16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_subgroup_f16_f32, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0].f32acc, matmul_id_subgroup_q1_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_subgroup_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_subgroup_q4_1_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_subgroup_q5_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f32acc, matmul_id_subgroup_q5_1_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f32acc, matmul_id_subgroup_q8_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f32acc, matmul_id_subgroup_q2_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f32acc, matmul_id_subgroup_q3_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_subgroup_q4_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_subgroup_q5_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_subgroup_q6_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S].f32acc, matmul_id_subgroup_iq1_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M].f32acc, matmul_id_subgroup_iq1_m_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f32acc, matmul_id_subgroup_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f32acc, matmul_id_subgroup_iq2_xs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f32acc, matmul_id_subgroup_iq2_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f32acc, matmul_id_subgroup_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc, matmul_id_subgroup_iq3_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f32acc, matmul_id_subgroup_iq4_xs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_subgroup_iq4_nl_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f32acc, matmul_id_subgroup_mxfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4].f32acc, matmul_id_subgroup_nvfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); } else { CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0].f32acc, matmul_id_q1_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f32acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f32acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f32acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f32acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_IQ1_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_S].f32acc, matmul_id_iq1_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_IQ1_M, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ1_M].f32acc, matmul_id_iq1_m_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f32acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f32acc, matmul_id_iq2_xs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f32acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f32acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f32acc, matmul_id_iq4_xs_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_MXFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_MXFP4].f32acc, matmul_id_mxfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_NVFP4].f32acc, matmul_id_nvfp4_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); } } // reusing CREATE_MM from the fp32 path if ((device->coopmat2 || device->coopmat_support) #if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) && !device->coopmat_bf16_support #endif ) { const uint32_t s_warptile_wm = device->subgroup_size == 8 ? 8 : 32; // use scalar tile sizes l_warptile = { 128, 128, 128, 16, subgroup_size_8 * 2, 64, 2, 4, 4, 1, subgroup_size_8 }; m_warptile = { 128, 64, 64, 16, subgroup_size_8, 32, 2, 4, 2, 1, subgroup_size_8 }; s_warptile = { subgroup_size_32, 32, 32, 16, s_warptile_wm, 32, 2, 2, 2, 1, subgroup_size_8 }; l_wg_denoms = {128, 128, 1 }; m_wg_denoms = { 64, 64, 1 }; s_wg_denoms = { 32, 32, 1 }; if (device->vendor_id == VK_VENDOR_ID_INTEL && device->architecture == INTEL_XE2) { // Xe2/Xe3 - bf16 warptile performance tuning l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, 4, 4, 1, subgroup_size_8 }; } CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); } #undef CREATE_MM // mul mat vec // the number of rows computed per shader depends on GPU model and quant uint32_t rm_stdq = 1; uint32_t rm_kq = 2; uint32_t rm_stdq_int = 1; uint32_t rm_kq_int = 1; auto const &rm_iq_int = [](uint32_t i) { return i == 0 ? 8u : 4u; }; if (device->vendor_id == VK_VENDOR_ID_AMD) { if (device->architecture == AMD_GCN) { rm_stdq = 2; rm_kq = 4; rm_stdq_int = 4; } } else if (device->vendor_id == VK_VENDOR_ID_INTEL) { rm_stdq = 2; rm_stdq_int = 2; } uint32_t rm_iq = 2 * rm_kq; const bool use_subgroups = device->subgroup_arithmetic && device->architecture != vk_device_architecture::AMD_GCN; // Ensure a subgroup size >= 16 is available const bool use_subgroups16 = use_subgroups && subgroup_min_size_16; const uint32_t subgroup_size = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control && device->subgroup_min_size <= 16 && device->subgroup_max_size >= 16) ? 16 : device->subgroup_size; const uint32_t subgroup_size16 = std::max(subgroup_size, 16u); const uint32_t force_subgroup_size = use_subgroups ? subgroup_size : 0; const uint32_t force_subgroup_size16 = use_subgroups16 ? subgroup_size16 : 0; static constexpr uint32_t mul_mat_vec_num_bindings = 5; static constexpr uint32_t mul_mat_vec_id_num_bindings = 6; for (uint32_t w = 0; w < DMMV_WG_SIZE_COUNT; ++w) { const uint32_t wg_size_subgroup = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size : (subgroup_size * 4); const uint32_t wg_size_subgroup16 = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size16 : (subgroup_size16 * 4); const shader_reduction_mode reduc = (use_subgroups && w == DMMV_WG_SIZE_SUBGROUP) ? SHADER_REDUCTION_MODE_SUBGROUP : (use_subgroups && w == DMMV_WG_SIZE_LARGE) ? SHADER_REDUCTION_MODE_HYBRID : SHADER_REDUCTION_MODE_SHMEM; const shader_reduction_mode reduc16 = (use_subgroups16 && w == DMMV_WG_SIZE_SUBGROUP) ? SHADER_REDUCTION_MODE_SUBGROUP : (use_subgroups16 && w == DMMV_WG_SIZE_LARGE) ? SHADER_REDUCTION_MODE_HYBRID : SHADER_REDUCTION_MODE_SHMEM; for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32", arr_dmmv_f32_f32_f32_len[reduc], arr_dmmv_f32_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {wg_size_subgroup, 1, i+1}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32", arr_dmmv_f16_f32_f32_len[reduc], arr_dmmv_f16_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f32_f32", arr_dmmv_bf16_f32_f32_len[reduc], arr_dmmv_bf16_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q1_0][i], "mul_mat_vec_q1_0_f32_f32", arr_dmmv_q1_0_f32_f32_len[reduc], arr_dmmv_q1_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32", arr_dmmv_q4_0_f32_f32_len[reduc], arr_dmmv_q4_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32", arr_dmmv_q4_1_f32_f32_len[reduc], arr_dmmv_q4_1_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32", arr_dmmv_q5_0_f32_f32_len[reduc], arr_dmmv_q5_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f32_f32", arr_dmmv_q5_1_f32_f32_len[reduc], arr_dmmv_q5_1_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f32_f32", arr_dmmv_q8_0_f32_f32_len[reduc], arr_dmmv_q8_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f32_f32", arr_dmmv_q2_k_f32_f32_len[reduc16], arr_dmmv_q2_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f32_f32", arr_dmmv_q3_k_f32_f32_len[reduc16], arr_dmmv_q3_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32", arr_dmmv_q4_k_f32_f32_len[reduc16], arr_dmmv_q4_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32", arr_dmmv_q5_k_f32_f32_len[reduc16], arr_dmmv_q5_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32", arr_dmmv_q6_k_f32_f32_len[reduc16], arr_dmmv_q6_k_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f32_f32", arr_dmmv_iq1_s_f32_f32_len[reduc16], arr_dmmv_iq1_s_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f32_f32", arr_dmmv_iq1_m_f32_f32_len[reduc16], arr_dmmv_iq1_m_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f32_f32", arr_dmmv_iq2_xxs_f32_f32_len[reduc16], arr_dmmv_iq2_xxs_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f32_f32", arr_dmmv_iq2_xs_f32_f32_len[reduc16], arr_dmmv_iq2_xs_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f32_f32", arr_dmmv_iq2_s_f32_f32_len[reduc16], arr_dmmv_iq2_s_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f32_f32", arr_dmmv_iq3_xxs_f32_f32_len[reduc16], arr_dmmv_iq3_xxs_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f32_f32", arr_dmmv_iq3_s_f32_f32_len[reduc16], arr_dmmv_iq3_s_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f32_f32", arr_dmmv_iq4_xs_f32_f32_len[reduc16], arr_dmmv_iq4_xs_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32", arr_dmmv_iq4_nl_f32_f32_len[reduc16], arr_dmmv_iq4_nl_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f32_f32", arr_dmmv_mxfp4_f32_f32_len[reduc16], arr_dmmv_mxfp4_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_NVFP4][i], "mul_mat_vec_nvfp4_f32_f32", arr_dmmv_nvfp4_f32_f32_len[reduc16], arr_dmmv_nvfp4_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32", arr_dmmv_f32_f16_f32_len[reduc], arr_dmmv_f32_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {wg_size_subgroup, 1, i+1}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32", arr_dmmv_f16_f16_f32_len[reduc], arr_dmmv_f16_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f16_f32", arr_dmmv_bf16_f16_f32_len[reduc], arr_dmmv_bf16_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q1_0][i], "mul_mat_vec_q1_0_f16_f32", arr_dmmv_q1_0_f16_f32_len[reduc], arr_dmmv_q1_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32", arr_dmmv_q4_0_f16_f32_len[reduc], arr_dmmv_q4_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32", arr_dmmv_q4_1_f16_f32_len[reduc], arr_dmmv_q4_1_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32", arr_dmmv_q5_0_f16_f32_len[reduc], arr_dmmv_q5_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f16_f32", arr_dmmv_q5_1_f16_f32_len[reduc], arr_dmmv_q5_1_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f16_f32", arr_dmmv_q8_0_f16_f32_len[reduc], arr_dmmv_q8_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f16_f32", arr_dmmv_q2_k_f16_f32_len[reduc16], arr_dmmv_q2_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f16_f32", arr_dmmv_q3_k_f16_f32_len[reduc16], arr_dmmv_q3_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32", arr_dmmv_q4_k_f16_f32_len[reduc16], arr_dmmv_q4_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32", arr_dmmv_q5_k_f16_f32_len[reduc16], arr_dmmv_q5_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32", arr_dmmv_q6_k_f16_f32_len[reduc16], arr_dmmv_q6_k_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_f16_f32", arr_dmmv_iq1_s_f16_f32_len[reduc16], arr_dmmv_iq1_s_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_f16_f32", arr_dmmv_iq1_m_f16_f32_len[reduc16], arr_dmmv_iq1_m_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XXS][i], "mul_mat_vec_iq2_xxs_f16_f32", arr_dmmv_iq2_xxs_f16_f32_len[reduc16], arr_dmmv_iq2_xxs_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_XS][i], "mul_mat_vec_iq2_xs_f16_f32", arr_dmmv_iq2_xs_f16_f32_len[reduc16], arr_dmmv_iq2_xs_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ2_S][i], "mul_mat_vec_iq2_s_f16_f32", arr_dmmv_iq2_s_f16_f32_len[reduc16], arr_dmmv_iq2_s_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_XXS][i], "mul_mat_vec_iq3_xxs_f16_f32", arr_dmmv_iq3_xxs_f16_f32_len[reduc16], arr_dmmv_iq3_xxs_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ3_S][i], "mul_mat_vec_iq3_s_f16_f32", arr_dmmv_iq3_s_f16_f32_len[reduc16], arr_dmmv_iq3_s_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_XS][i], "mul_mat_vec_iq4_xs_f16_f32", arr_dmmv_iq4_xs_f16_f32_len[reduc16], arr_dmmv_iq4_xs_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32", arr_dmmv_iq4_nl_f16_f32_len[reduc16], arr_dmmv_iq4_nl_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f16_f32", arr_dmmv_mxfp4_f16_f32_len[reduc16], arr_dmmv_mxfp4_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_NVFP4][i], "mul_mat_vec_nvfp4_f16_f32", arr_dmmv_nvfp4_f16_f32_len[reduc16], arr_dmmv_nvfp4_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) if (device->integer_dot_product) { const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size; const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_q8_1_f32", arr_dmmv_q4_0_q8_1_f32_len[reduc], arr_dmmv_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_q8_1_f32", arr_dmmv_q4_1_q8_1_f32_len[reduc], arr_dmmv_q4_1_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_q8_1_f32", arr_dmmv_q5_0_q8_1_f32_len[reduc], arr_dmmv_q5_0_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_q8_1_f32", arr_dmmv_q5_1_q8_1_f32_len[reduc], arr_dmmv_q5_1_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_q8_1_f32", arr_dmmv_q8_0_q8_1_f32_len[reduc], arr_dmmv_q8_0_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_q8_1_f32", arr_dmmv_mxfp4_q8_1_f32_len[reduc], arr_dmmv_mxfp4_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_q8_1_f32", arr_dmmv_q2_k_q8_1_f32_len[reduc], arr_dmmv_q2_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_q8_1_f32", arr_dmmv_q3_k_q8_1_f32_len[reduc], arr_dmmv_q3_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_q8_1_f32", arr_dmmv_q4_k_q8_1_f32_len[reduc], arr_dmmv_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_q8_1_f32", arr_dmmv_q5_k_q8_1_f32_len[reduc], arr_dmmv_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_q8_1_f32", arr_dmmv_q6_k_q8_1_f32_len[reduc], arr_dmmv_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_q8_1_f32", arr_dmmv_iq1_s_q8_1_f32_len[reduc], arr_dmmv_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(i), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(i), i+1}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_q8_1_f32", arr_dmmv_iq1_m_q8_1_f32_len[reduc], arr_dmmv_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(i), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(i), i+1}, 1, true, use_subgroups, subgroup_size_int); } #endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT } ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", arr_dmmv_id_f32_f32_f32_len[reduc], arr_dmmv_id_f32_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {wg_size_subgroup, 1}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", arr_dmmv_id_f16_f32_f32_len[reduc], arr_dmmv_id_f16_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {wg_size_subgroup, 2}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_BF16], "mul_mat_vec_id_bf16_f32", arr_dmmv_id_bf16_f32_f32_len[reduc], arr_dmmv_id_bf16_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {wg_size_subgroup, 2}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q1_0], "mul_mat_vec_id_q1_0_f32", arr_dmmv_id_q1_0_f32_f32_len[reduc], arr_dmmv_id_q1_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", arr_dmmv_id_q4_0_f32_f32_len[reduc], arr_dmmv_id_q4_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", arr_dmmv_id_q4_1_f32_f32_len[reduc], arr_dmmv_id_q4_1_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", arr_dmmv_id_q5_0_f32_f32_len[reduc], arr_dmmv_id_q5_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", arr_dmmv_id_q5_1_f32_f32_len[reduc], arr_dmmv_id_q5_1_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", arr_dmmv_id_q8_0_f32_f32_len[reduc], arr_dmmv_id_q8_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq, 1, 1}, {wg_size_subgroup, 1*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", arr_dmmv_id_q2_k_f32_f32_len[reduc16], arr_dmmv_id_q2_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", arr_dmmv_id_q3_k_f32_f32_len[reduc16], arr_dmmv_id_q3_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", arr_dmmv_id_q4_k_f32_f32_len[reduc16], arr_dmmv_id_q4_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", arr_dmmv_id_q5_k_f32_f32_len[reduc16], arr_dmmv_id_q5_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", arr_dmmv_id_q6_k_f32_f32_len[reduc16], arr_dmmv_id_q6_k_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {wg_size_subgroup16, rm_kq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_f32", arr_dmmv_id_iq1_s_f32_f32_len[reduc16], arr_dmmv_id_iq1_s_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_f32", arr_dmmv_id_iq1_m_f32_f32_len[reduc16], arr_dmmv_id_iq1_m_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ2_XXS], "mul_mat_vec_id_iq2_xxs_f32", arr_dmmv_id_iq2_xxs_f32_f32_len[reduc16], arr_dmmv_id_iq2_xxs_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ2_XS], "mul_mat_vec_id_iq2_xs_f32", arr_dmmv_id_iq2_xs_f32_f32_len[reduc16], arr_dmmv_id_iq2_xs_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ2_S], "mul_mat_vec_id_iq2_s_f32", arr_dmmv_id_iq2_s_f32_f32_len[reduc16], arr_dmmv_id_iq2_s_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ3_XXS], "mul_mat_vec_id_iq3_xxs_f32", arr_dmmv_id_iq3_xxs_f32_f32_len[reduc16], arr_dmmv_id_iq3_xxs_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ3_S], "mul_mat_vec_id_iq3_s_f32", arr_dmmv_id_iq3_s_f32_f32_len[reduc16], arr_dmmv_id_iq3_s_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ4_XS], "mul_mat_vec_id_iq4_xs_f32", arr_dmmv_id_iq4_xs_f32_f32_len[reduc16], arr_dmmv_id_iq4_xs_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", arr_dmmv_id_iq4_nl_f32_f32_len[reduc16], arr_dmmv_id_iq4_nl_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_f32", arr_dmmv_id_mxfp4_f32_f32_len[reduc16], arr_dmmv_id_mxfp4_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_NVFP4], "mul_mat_vec_id_nvfp4_f32", arr_dmmv_id_nvfp4_f32_f32_len[reduc16], arr_dmmv_id_nvfp4_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) if (device->integer_dot_product) { const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size; const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_q8_1_f32", arr_dmmv_id_q4_0_q8_1_f32_len[reduc], arr_dmmv_id_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_q8_1_f32", arr_dmmv_id_q4_1_q8_1_f32_len[reduc], arr_dmmv_id_q4_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_q8_1_f32", arr_dmmv_id_q5_0_q8_1_f32_len[reduc], arr_dmmv_id_q5_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_q8_1_f32", arr_dmmv_id_q5_1_q8_1_f32_len[reduc], arr_dmmv_id_q5_1_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_q8_1_f32", arr_dmmv_id_q8_0_q8_1_f32_len[reduc], arr_dmmv_id_q8_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_q8_1_f32", arr_dmmv_id_mxfp4_q8_1_f32_len[reduc], arr_dmmv_id_mxfp4_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_q8_1_f32", arr_dmmv_id_q2_k_q8_1_f32_len[reduc], arr_dmmv_id_q2_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 2*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_q8_1_f32", arr_dmmv_id_q3_k_q8_1_f32_len[reduc], arr_dmmv_id_q3_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_q8_1_f32", arr_dmmv_id_q4_k_q8_1_f32_len[reduc], arr_dmmv_id_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_q8_1_f32", arr_dmmv_id_q5_k_q8_1_f32_len[reduc], arr_dmmv_id_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_q8_1_f32", arr_dmmv_id_q6_k_q8_1_f32_len[reduc], arr_dmmv_id_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_q8_1_f32", arr_dmmv_id_iq1_s_q8_1_f32_len[reduc], arr_dmmv_id_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_q8_1_f32", arr_dmmv_id_iq1_m_q8_1_f32_len[reduc], arr_dmmv_id_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int); } #endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT } #if !defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) GGML_UNUSED(rm_stdq_int); GGML_UNUSED(rm_kq_int); GGML_UNUSED(rm_iq_int); #endif // dequant shaders ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q1_0], "dequant_q1_0", dequant_q1_0_len, dequant_q1_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 8, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ1_S], "dequant_iq1_s", dequant_iq1_s_len, dequant_iq1_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ1_M], "dequant_iq1_m", dequant_iq1_m_len, dequant_iq1_m_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_XXS], "dequant_iq2_xxs", dequant_iq2_xxs_len, dequant_iq2_xxs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_XS], "dequant_iq2_xs", dequant_iq2_xs_len, dequant_iq2_xs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ2_S], "dequant_iq2_s", dequant_iq2_s_len, dequant_iq2_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_XXS], "dequant_iq3_xxs", dequant_iq3_xxs_len, dequant_iq3_xxs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ3_S], "dequant_iq3_s", dequant_iq3_s_len, dequant_iq3_s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_XS], "dequant_iq4_xs", dequant_iq4_xs_len, dequant_iq4_xs_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_MXFP4], "dequant_mxfp4", dequant_mxfp4_len, dequant_mxfp4_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_NVFP4], "dequant_nvfp4", dequant_nvfp4_len, dequant_nvfp4_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); // get_rows ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_BF16], "get_rows_bf16", get_rows_bf16_len, get_rows_bf16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q1_0], "get_rows_q1_0", get_rows_q1_0_len, get_rows_q1_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q2_K], "get_rows_q2_k", get_rows_q2_k_len, get_rows_q2_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q3_K], "get_rows_q3_k", get_rows_q3_k_len, get_rows_q3_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_K], "get_rows_q4_k", get_rows_q4_k_len, get_rows_q4_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_K], "get_rows_q5_k", get_rows_q5_k_len, get_rows_q5_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q6_K], "get_rows_q6_k", get_rows_q6_k_len, get_rows_q6_k_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ1_S], "get_rows_iq1_s", get_rows_iq1_s_len, get_rows_iq1_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ1_M], "get_rows_iq1_m", get_rows_iq1_m_len, get_rows_iq1_m_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_XXS], "get_rows_iq2_xxs", get_rows_iq2_xxs_len, get_rows_iq2_xxs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_XS], "get_rows_iq2_xs", get_rows_iq2_xs_len, get_rows_iq2_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ2_S], "get_rows_iq2_s", get_rows_iq2_s_len, get_rows_iq2_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs", get_rows_iq3_xxs_len, get_rows_iq3_xxs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ3_S], "get_rows_iq3_s", get_rows_iq3_s_len, get_rows_iq3_s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs", get_rows_iq4_xs_len, get_rows_iq4_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4], "get_rows_mxfp4", get_rows_mxfp4_len, get_rows_mxfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_NVFP4], "get_rows_nvfp4", get_rows_nvfp4_len, get_rows_nvfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32], "get_rows_i32", get_rows_i32_len, get_rows_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_BF16], "get_rows_bf16_f32", get_rows_bf16_f32_len, get_rows_bf16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q1_0], "get_rows_q1_0_f32", get_rows_q1_0_f32_len, get_rows_q1_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q2_K], "get_rows_q2_k_f32", get_rows_q2_k_f32_len, get_rows_q2_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q3_K], "get_rows_q3_k_f32", get_rows_q3_k_f32_len, get_rows_q3_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_K], "get_rows_q4_k_f32", get_rows_q4_k_f32_len, get_rows_q4_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_K], "get_rows_q5_k_f32", get_rows_q5_k_f32_len, get_rows_q5_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q6_K], "get_rows_q6_k_f32", get_rows_q6_k_f32_len, get_rows_q6_k_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ1_S], "get_rows_iq1_s_f32", get_rows_iq1_s_f32_len, get_rows_iq1_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ1_M], "get_rows_iq1_m_f32", get_rows_iq1_m_f32_len, get_rows_iq1_m_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_XXS], "get_rows_iq2_xxs_f32", get_rows_iq2_xxs_f32_len, get_rows_iq2_xxs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_XS], "get_rows_iq2_xs_f32", get_rows_iq2_xs_f32_len, get_rows_iq2_xs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ2_S], "get_rows_iq2_s_f32", get_rows_iq2_s_f32_len, get_rows_iq2_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_XXS], "get_rows_iq3_xxs_f32", get_rows_iq3_xxs_f32_len, get_rows_iq3_xxs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ3_S], "get_rows_iq3_s_f32", get_rows_iq3_s_f32_len, get_rows_iq3_s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs_f32", get_rows_iq4_xs_f32_len, get_rows_iq4_xs_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_MXFP4], "get_rows_mxfp4_f32", get_rows_mxfp4_f32_len, get_rows_mxfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_NVFP4], "get_rows_nvfp4_f32", get_rows_nvfp4_f32_len, get_rows_nvfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, sizeof(vk_op_flash_attn_split_k_reduce_push_constants), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true); for (auto &it : device->pipeline_fa_mask_opt) { auto BrBc = it.first; ggml_vk_create_pipeline(device, it.second, "fa_mask_opt", fa_mask_opt_len, fa_mask_opt_data, "main", 2, sizeof(vk_op_flash_attn_mask_opt_push_constants), {1, 1, 1}, {128, 128 / device->subgroup_size, BrBc.first, BrBc.second}, 1, true, true, device->subgroup_size); } if (device->subgroup_clustered && device->subgroup_require_full_support) { ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true); } else { ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1); } for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) { if (device->subgroup_arithmetic && device->subgroup_require_full_support) { ggml_vk_create_pipeline2(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_subgroup_add_len, mul_mat_vec_p021_f16_f32_subgroup_add_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_p021_push_constants), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true, true); } else { ggml_vk_create_pipeline2(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_p021_push_constants), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true); } } ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_nc_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_f32, "rms_norm_mul_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 1}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_rms_norm_partials_f32, "rms_norm_partials_f32", rms_norm_partials_f32_len, rms_norm_partials_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_partials_f32, "rms_norm_mul_partials_f32", rms_norm_partials_f32_len, rms_norm_partials_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 1}, 1, true); if (sizeof(vk_op_rms_norm_mul_rope_push_constants) <= device->properties.limits.maxPushConstantsSize) { ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_rope_f32_f32, "rms_norm_mul_rope_f32_f32", rms_norm_mul_rope_f32_f32_len, rms_norm_mul_rope_f32_f32_data, "main", 7, sizeof(vk_op_rms_norm_mul_rope_push_constants), {1, 1, 1}, {0, 1}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_rope_f32_f16, "rms_norm_mul_rope_f32_f16", rms_norm_mul_rope_f32_f16_len, rms_norm_mul_rope_f32_f16_data, "main", 7, sizeof(vk_op_rms_norm_mul_rope_push_constants), {1, 1, 1}, {0, 1}, 1, true); } ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f32, "cpy_f16_f32", cpy_f16_f32_len, cpy_f16_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_bf16,"cpy_f32_bf16",cpy_f32_bf16_len,cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_i32_f32, "cpy_i32_f32", cpy_i32_f32_len, cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_i32, "cpy_f32_i32", cpy_f32_i32_len, cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f32, "contig_cpy_f16_f32", contig_cpy_f16_f32_len, contig_cpy_f16_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_bf16,"contig_cpy_f32_bf16",contig_cpy_f32_bf16_len,contig_cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_i32_f32, "contig_cpy_i32_f32", contig_cpy_i32_f32_len, contig_cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_i32, "contig_cpy_f32_i32", contig_cpy_f32_i32_len, contig_cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_32, "cpy_transpose_32", cpy_transpose_32_len, cpy_transpose_32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_16, "cpy_transpose_16", cpy_transpose_16_len, cpy_transpose_16_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q1_0], "cpy_f32_q1_0", cpy_f32_q1_0_len, cpy_f32_q1_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); #define SET_ROWS(itype) \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F32], "set_rows_f32" #itype, set_rows_f32 ## itype ## _len, set_rows_f32 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F16], "set_rows_f16" #itype, set_rows_f16 ## itype ## _len, set_rows_f16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_BF16], "set_rows_bf16" #itype, set_rows_bf16 ## itype ## _len, set_rows_bf16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q1_0], "set_rows_q1_0" #itype, set_rows_q1_0 ## itype ## _len, set_rows_q1_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_0], "set_rows_q4_0" #itype, set_rows_q4_0 ## itype ## _len, set_rows_q4_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_1], "set_rows_q4_1" #itype, set_rows_q4_1 ## itype ## _len, set_rows_q4_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_0], "set_rows_q5_0" #itype, set_rows_q5_0 ## itype ## _len, set_rows_q5_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_1], "set_rows_q5_1" #itype, set_rows_q5_1 ## itype ## _len, set_rows_q5_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q8_0], "set_rows_q8_0" #itype, set_rows_q8_0 ## itype ## _len, set_rows_q8_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_IQ4_NL], "set_rows_iq4_nl" #itype, set_rows_iq4_nl ## itype ## _len, set_rows_iq4_nl ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); SET_ROWS(_i32) SET_ROWS(_i64) #undef SET_ROWS ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q1_0], "cpy_q1_0_f32", cpy_q1_0_f32_len, cpy_q1_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q1_0), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_1], "cpy_q4_1_f32", cpy_q4_1_f32_len, cpy_q4_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_0], "cpy_q5_0_f32", cpy_q5_0_f32_len, cpy_q5_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_1], "cpy_q5_1_f32", cpy_q5_1_f32_len, cpy_q5_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q8_0], "cpy_q8_0_f32", cpy_q8_0_f32_len, cpy_q8_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_IQ4_NL], "cpy_iq4_nl_f32", cpy_iq4_nl_f32_len, cpy_iq4_nl_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1); auto get_suffix = [](bool src0_f16, bool src1_f16, bool dst_f16) { std::string s; s += std::string(src0_f16 ? "_f16" : "_f32"); s += std::string(src1_f16 ? "_f16" : "_f32"); s += std::string(dst_f16 ? "_f16" : "_f32"); return s; }; #define CREATE_BINARY(name, namemod, spec, bindings) \ for (int s0 : {0,1}) for (int s1 : {0,1}) for (int d : {0,1}) \ ggml_vk_create_pipeline2(device, device->pipeline_ ## name ## namemod[s0][s1][d], \ #name + get_suffix(s0, s1, d) + #namemod, name ## _len[s0][s1][d], name ## _data[s0][s1][d], \ "main", (bindings), sizeof(vk_op_binary_push_constants), {512, 1, 1}, spec, 1); CREATE_BINARY(add, , {0}, 4) CREATE_BINARY(add, _norepeat, {1}, 4) CREATE_BINARY(sub, , {0}, 3) CREATE_BINARY(sub, _norepeat, {1}, 3) CREATE_BINARY(mul, , {0}, 3) CREATE_BINARY(mul, _norepeat, {1}, 3) CREATE_BINARY(div, , {0}, 3) CREATE_BINARY(div, _norepeat, {1}, 3) CREATE_BINARY(add_rms, , {0}, 4) CREATE_BINARY(add_rms, _norepeat, {1}, 4) #undef CREATE_BINARY if (device->multi_add) { for (uint32_t i = 0; i < MAX_FUSED_ADDS; ++i) { ggml_vk_create_pipeline2(device, device->pipeline_multi_add[i], "multi_add_f32_" + std::to_string(i+1), multi_add_f32_len, multi_add_f32_data, "main", MAX_PARAMETER_COUNT, sizeof(vk_op_multi_add_push_constants), {512, 1, 1}, {i+2}, 1); ggml_vk_create_pipeline2(device, device->pipeline_multi_add_rms[i], "multi_add_rms_f32_" + std::to_string(i+1), multi_add_rms_f32_len, multi_add_rms_f32_data, "main", MAX_PARAMETER_COUNT, sizeof(vk_op_multi_add_push_constants), {512, 1, 1}, {i+2}, 1); } } ggml_vk_create_pipeline(device, device->pipeline_add_id_f32, "add_id_f32", add_id_f32_len, add_id_f32_data, "main", 4, sizeof(vk_op_add_id_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0, 1}, 1); ggml_vk_create_pipeline(device, device->pipeline_set_f32, "set_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0, 0}, 1); ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1); ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1); ggml_vk_create_pipeline(device, device->pipeline_upscale_bicubic_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BICUBIC}, 1); ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_antialias_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS}, 1); ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_sqrt_f32, "sqrt_f32", sqrt_f32_len, sqrt_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_log[0], "log_f32", log_f32_len, log_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_log[1], "log_f16", log_f16_len, log_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_tri[0], "tri_f32", tri_f32_len, tri_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_tri[1], "tri_f16", tri_f16_len, tri_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_diag[0], "diag_f32", diag_f32_len, diag_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_diag[1], "diag_f16", diag_f16_len, diag_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); #define CREATE_UNARY(name) \ ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \ ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); CREATE_UNARY(elu) CREATE_UNARY(gelu) CREATE_UNARY(gelu_erf) CREATE_UNARY(gelu_quick) CREATE_UNARY(silu) CREATE_UNARY(relu) CREATE_UNARY(xielu) CREATE_UNARY(neg) CREATE_UNARY(tanh) CREATE_UNARY(sigmoid) CREATE_UNARY(hardsigmoid) CREATE_UNARY(hardswish) CREATE_UNARY(abs) CREATE_UNARY(softplus) CREATE_UNARY(step) CREATE_UNARY(round) CREATE_UNARY(ceil) CREATE_UNARY(floor) CREATE_UNARY(trunc) CREATE_UNARY(sgn) CREATE_UNARY(exp) #undef CREATE_UNARY ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f16, "add1_f16_f16", add1_f16_f16_len, add1_f16_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f32, "add1_f16_f32", add1_f16_f32_len, add1_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_add1_f32_f32, "add1_f32_f32", add1_f32_f32_len, add1_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_arange_f32, "arange_f32", arange_f32_len, arange_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_fill_f32, "fill_f32", fill_f32_len, fill_f32_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_fill_f16, "fill_f16", fill_f16_len, fill_f16_data, "main", 1, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); #define CREATE_GLU(name) \ ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); CREATE_GLU(geglu) CREATE_GLU(reglu) CREATE_GLU(swiglu) CREATE_GLU(swiglu_oai) CREATE_GLU(geglu_erf) CREATE_GLU(geglu_quick) #undef CREATE_GLU ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_silu_back_f32, "silu_back_f32", silu_back_f32_len, silu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {1, 512, 1}, {}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_wg512, "soft_max_f32_wg512", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true); ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32, "soft_max_large1_f32", soft_max_large1_f32_len, soft_max_large1_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32, "soft_max_large2_f32", soft_max_large2_f32_len, soft_max_large2_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32, "soft_max_large3_f32", soft_max_large3_f32_len, soft_max_large3_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32_f16, "soft_max_large1_f32_f16", soft_max_large1_f32_f16_len, soft_max_large1_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32_f16, "soft_max_large2_f32_f16", soft_max_large2_f32_f16_len, soft_max_large2_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32_f16, "soft_max_large3_f32_f16", soft_max_large3_f32_f16_len, soft_max_large3_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f32, "rope_vision_f32", rope_vision_f32_len, rope_vision_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f16, "rope_multi_f16", rope_multi_f16_len, rope_multi_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f16, "rope_vision_f16", rope_vision_f16_len, rope_vision_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32_f16, "rope_norm_f32_f16", rope_norm_f32_f16_len, rope_norm_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32_f16, "rope_neox_f32_f16", rope_neox_f32_f16_len, rope_neox_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32_f16, "rope_multi_f32_f16", rope_multi_f32_f16_len, rope_multi_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); for (uint32_t i = 0; i < num_argsort_pipelines; ++i) { uint32_t BLOCK_SIZE = 1u << std::min(i, device->max_workgroup_size_log2); if (i <= device->max_workgroup_size_log2 && 2 * sizeof(int) * BLOCK_SIZE <= device->properties.limits.maxComputeSharedMemorySize) { const uint32_t NCOLS_PADDED_LOG2 = i; ggml_vk_create_pipeline2(device, device->pipeline_argsort_f32[i], "argsort_f32_"+std::to_string(i), argsort_f32_len, argsort_f32_data, "main", 3, sizeof(vk_op_argsort_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, NCOLS_PADDED_LOG2}, 1, true); } const uint32_t WG_UNROLL_FACTOR = BLOCK_SIZE > 1 ? 2 : 1; BLOCK_SIZE /= WG_UNROLL_FACTOR; ggml_vk_create_pipeline2(device, device->pipeline_argsort_large_f32[i], "argsort_large_f32_"+std::to_string(i), argsort_large_f32_len, argsort_large_f32_data, "main", 3, sizeof(vk_op_argsort_push_constants), {BLOCK_SIZE * WG_UNROLL_FACTOR, 1, 1}, {BLOCK_SIZE, WG_UNROLL_FACTOR}, 1, true); } for (uint32_t i = 0; i < num_topk_pipelines; ++i) { const uint32_t BLOCK_SIZE = 1u << i; const uint32_t NCOLS_PADDED_LOG2 = i; if (i <= device->max_workgroup_size_log2) { uint32_t nary_shmem = 2 * sizeof(int) * BLOCK_SIZE + sizeof(int) * device->subgroup_size + 2 * sizeof(int) + 2 * (BLOCK_SIZE / device->subgroup_size) * sizeof(int); if (device->subgroup_arithmetic && device->subgroup_require_full_support && device->subgroup_shuffle && device->subgroup_ballot && nary_shmem <= device->properties.limits.maxComputeSharedMemorySize) { ggml_vk_create_pipeline2(device, device->pipeline_topk_f32[i], "topk_f32_"+std::to_string(i), topk_nary_search_f32_len, topk_nary_search_f32_data, "main", 2, sizeof(vk_op_topk_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, device->subgroup_size, device->subgroup_size_log2}, 1, true, true, device->subgroup_size); } else if (2 * sizeof(int) * BLOCK_SIZE <= device->properties.limits.maxComputeSharedMemorySize) { ggml_vk_create_pipeline2(device, device->pipeline_topk_f32[i], "topk_f32_"+std::to_string(i), topk_argsort_f32_len, topk_argsort_f32_data, "main", 2, sizeof(vk_op_topk_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, NCOLS_PADDED_LOG2}, 1, true); } } } ggml_vk_create_pipeline(device, device->pipeline_argmax_f32, "argmax_f32", argmax_f32_len, argmax_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); const uint32_t cumsum_elem_per_thread = (device->vendor_id == VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 2 : 4; ggml_vk_create_pipeline(device, device->pipeline_cumsum_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 256, device->subgroup_size, cumsum_elem_per_thread }, 1, true, true, device->subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_cumsum_small_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 128, device->subgroup_size, 1 }, 1, true, true, device->subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass1_f32, "cumsum_multipass1_f32", cumsum_multipass1_f32_len, cumsum_multipass1_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, device->subgroup_size }, 1, true, true, device->subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass2_f32, "cumsum_multipass2_f32", cumsum_multipass2_f32_len, cumsum_multipass2_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, device->subgroup_size }, 1, true, true, device->subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_count_equal_i32, "count_equal_i32", count_equal_i32_len, count_equal_i32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_count_experts, "count_experts", count_experts_len, count_experts_data, "main", 2, sizeof(vk_op_count_experts_push_constants), {1, 1, 1}, {}, 1, true); for (auto &s : device->pipeline_solve_tri_f32) { const vk_solve_tri_pipeline_state &state = s.first; // Max number of rows to load at a time, limited by shared memory const uint32_t batch_N = device->properties.limits.maxComputeSharedMemorySize / ((state.N + state.K) * sizeof(float)); // Need at least K invocations, and prefer a minimum of 128 to spread out loading shared memory const uint32_t block_size = std::max(128u, 1u << (uint32_t)ceilf(log2f(float(state.K)))); ggml_vk_create_pipeline( device, s.second, "solve_tri_f32", solve_tri_f32_len, solve_tri_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, { 0, state.N, state.K, batch_N, block_size }, 1, true); } #define IM2COL(bda) \ ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32 ## bda ## _len, im2col_f32 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32, "im2col_3d_f32", im2col_3d_f32 ## bda ## _len, im2col_3d_f32 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16 ## bda ## _len, im2col_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32_f16, "im2col_3d_f32_f16", im2col_3d_f32_f16 ## bda ## _len, im2col_3d_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); if (device->shader_int64 && device->buffer_device_address) { IM2COL(_bda) } else { IM2COL() } ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); { const uint32_t gdn_sizes[] = {32, 64, 128}; const char * gdn_names[][2] = { {"gated_delta_net_f32_d32", "gated_delta_net_f32_d32_kda"}, {"gated_delta_net_f32_d64", "gated_delta_net_f32_d64_kda"}, {"gated_delta_net_f32_d128", "gated_delta_net_f32_d128_kda"}, }; const bool use_subgroup_reduce = device->subgroup_arithmetic; for (uint32_t si = 0; si < 3; si++) { const uint32_t S_V = gdn_sizes[si]; GGML_ASSERT(is_pow2(S_V)); uint32_t lanes_per_column; if (S_V >= 128u && device->subgroup_clustered) { lanes_per_column = 8u; } else { // Use largest power-of-two that divides both S_V and subgroup_size so that // (1) S_V % lanes_per_column == 0 and (2) S_V % (subgroup_size / lanes_per_column) == 0. // This means we don't need extra bounds checking logic in the shader. lanes_per_column = std::min(S_V, device->subgroup_size); } const bool need_clustered_shader = lanes_per_column != 1 && (lanes_per_column < device->subgroup_size); size_t gdn_len; const void * gdn_data; if (use_subgroup_reduce && need_clustered_shader) { gdn_len = gated_delta_net_f32_len; gdn_data = (const void *)gated_delta_net_f32_data; } else if (use_subgroup_reduce) { gdn_len = gated_delta_net_f32_nocluster_len; gdn_data = (const void *)gated_delta_net_f32_nocluster_data; } else { gdn_len = gated_delta_net_f32_shmem_len; gdn_data = (const void *)gated_delta_net_f32_shmem_data; } const uint32_t cols_per_wg = device->subgroup_size / lanes_per_column; const std::array wg_denoms = {1u, 1u, cols_per_wg}; for (uint32_t kda = 0; kda < 2; kda++) { ggml_vk_create_pipeline(device, device->pipeline_gated_delta_net[si][kda], gdn_names[si][kda], gdn_len, gdn_data, "main", 7, sizeof(vk_op_gated_delta_net_push_constants), wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_reduce, device->subgroup_size); } } } if (device->subgroup_arithmetic && device->subgroup_require_full_support) { ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size}, 1, true, true); ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size}, 1, true, true); } else { ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1, true, true); ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true); } ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_f32, "ssm_conv_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 3, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16}, 1); ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_opt_step_sgd_f32, "opt_step_sgd_f32", opt_step_sgd_f32_len, opt_step_sgd_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); // conv2d, conv_transpose_2d for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) { uint32_t conv2d_WG_SIZE = 256; uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices. uint32_t conv2d_TS_K = (s == CONV_SHAPE_64x32) ? 4 : 8; uint32_t conv2d_SHMEM_PAD = 4; vk_conv_block_size conv2d_BS = vk_conv_block_sizes[s]; bool conv2d_UNROLL = true; #if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) if (device->coopmat2) { conv2d_SHMEM_PAD = 8; // 8 float16_t } #endif if (device->vendor_id == VK_VENDOR_ID_INTEL) { conv2d_SHMEM_PAD = 0; conv2d_UNROLL = false; } else if (device->vendor_id == VK_VENDOR_ID_AMD) { conv2d_SHMEM_PAD = device->architecture == vk_device_architecture::AMD_GCN ? 1 : 4; if (s == CONV_SHAPE_128x128 && device->architecture != vk_device_architecture::AMD_GCN) { conv2d_UNROLL = false; } } // Use collectives on pre-Turing NVIDIA GPUs and GCN AMD cards, which had slower integer math. bool allow_collectives_nv = device->vendor_id != VK_VENDOR_ID_NVIDIA || device->architecture == vk_device_architecture::NVIDIA_PRE_TURING; bool allow_collectives_amd = device->vendor_id != VK_VENDOR_ID_AMD || device->architecture == vk_device_architecture::AMD_GCN; if (device->subgroup_shuffle && device->vendor_id != VK_VENDOR_ID_INTEL && // Do not enable collectives on Intel, see PR 14316. allow_collectives_nv && allow_collectives_amd) { use_collectives = 1; conv2d_BS.CRS = std::min( device->subgroup_size, conv2d_BS.CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used. } uint32_t conv2d_shmem_req = (conv2d_BS.K * (conv2d_BS.CRS + conv2d_SHMEM_PAD) + conv2d_BS.CRS * (conv2d_BS.NPQ + conv2d_SHMEM_PAD)) * sizeof(float); if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) { conv2d_BS.CRS = 8; if (use_collectives) { conv2d_BS.CRS = std::min(device->subgroup_size, conv2d_BS.CRS); } } std::array wg_denoms = { conv2d_BS.K, 1, 1 }; std::vector spec_constants = { conv2d_WG_SIZE, conv2d_BS.K, conv2d_BS.CRS, conv2d_BS.NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD }; #define CREATE_CONV(name, type_suffix, spv_suffix) \ for (auto &c : device->pipeline_##name##type_suffix[s]) { \ const vk_conv2d_pipeline_state &state = c.first; \ std::vector spec_constants_cpy = spec_constants; \ spec_constants_cpy.push_back(state.s0); \ spec_constants_cpy.push_back(state.s1); \ spec_constants_cpy.push_back(state.p0); \ spec_constants_cpy.push_back(state.p1); \ spec_constants_cpy.push_back(state.d0); \ spec_constants_cpy.push_back(state.d1); \ spec_constants_cpy.push_back(state.KW); \ spec_constants_cpy.push_back(state.KH); \ ggml_vk_create_pipeline( \ device, c.second, #name #type_suffix, \ name##type_suffix##spv_suffix##_len, name##type_suffix##spv_suffix##_data, "main", 3, \ sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives); \ } #define CREATE_CONVS(spv_suffix) \ CREATE_CONV(conv2d, _f32, spv_suffix) \ CREATE_CONV(conv2d, _f16_f32, spv_suffix) \ CREATE_CONV(conv_transpose_2d, _f32, spv_suffix) \ CREATE_CONV(conv_transpose_2d, _f16_f32, spv_suffix) #if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) if (device->coopmat2) { CREATE_CONVS(_cm2) } else #endif if (conv2d_UNROLL) { CREATE_CONVS(_unroll) } else { CREATE_CONVS( ) } #undef CREATE_CONV #undef CREATE_CONVS } ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); for (uint32_t use_push = 0; use_push < 2; ++use_push) { for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) { ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][use_push], "topk_moe_f32_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 4, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); } } for (auto &c : compiles) { c.wait(); } } static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch); static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev); static vk_device ggml_vk_get_device(size_t idx) { VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")"); if (vk_instance.devices[idx] == nullptr) { VK_LOG_DEBUG("Initializing new vk_device"); vk_device device = std::make_shared(); vk_instance.devices[idx] = device; device->memory_logger = std::unique_ptr(new vk_memory_logger()); size_t dev_num = vk_instance.device_indices[idx]; std::vector physical_devices = vk_instance.instance.enumeratePhysicalDevices(); if (dev_num >= physical_devices.size()) { std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl; throw std::runtime_error("Device not found"); } device->physical_device = physical_devices[dev_num]; const std::vector ext_props = device->physical_device.enumerateDeviceExtensionProperties(); device->architecture = get_device_architecture(device->physical_device); const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY"); device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr; const char* GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM = getenv("GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM"); device->disable_host_visible_vidmem = GGML_VK_DISABLE_HOST_VISIBLE_VIDMEM != nullptr; const char* GGML_VK_ALLOW_SYSMEM_FALLBACK = getenv("GGML_VK_ALLOW_SYSMEM_FALLBACK"); device->allow_sysmem_fallback = GGML_VK_ALLOW_SYSMEM_FALLBACK != nullptr; const char* GGML_VK_DISABLE_GRAPH_OPTIMIZE = getenv("GGML_VK_DISABLE_GRAPH_OPTIMIZE"); device->disable_graph_optimize = GGML_VK_DISABLE_GRAPH_OPTIMIZE != nullptr; bool fp16_storage = false; bool fp16_compute = false; bool maintenance4_support = false; bool sm_builtins = false; bool amd_shader_core_properties2 = false; bool pipeline_robustness = false; bool coopmat2_support = false; bool pipeline_executable_properties_support = false; device->coopmat_support = false; device->integer_dot_product = false; device->shader_64b_indexing = false; bool bfloat16_support = false; for (const auto& properties : ext_props) { if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { maintenance4_support = true; } else if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { fp16_storage = true; } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { fp16_compute = true; } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) { sm_builtins = true; } else if (strcmp("VK_AMD_shader_core_properties2", properties.extensionName) == 0) { amd_shader_core_properties2 = true; } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { pipeline_robustness = true; } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { device->subgroup_size_control = true; #if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_COOPMAT")) { device->coopmat_support = true; device->coopmat_m = 0; device->coopmat_n = 0; device->coopmat_k = 0; #endif #if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_COOPMAT2")) { coopmat2_support = true; #endif #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) { device->integer_dot_product = true; #endif #if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) } else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_BFLOAT16")) { bfloat16_support = true; #endif } else if (strcmp("VK_KHR_pipeline_executable_properties", properties.extensionName) == 0) { pipeline_executable_properties_support = true; } else if (strcmp("VK_EXT_memory_priority", properties.extensionName) == 0 && getenv("GGML_VK_ENABLE_MEMORY_PRIORITY")) { device->memory_priority = true; } else if (strcmp("VK_EXT_external_memory_host", properties.extensionName) == 0) { device->external_memory_host = true; #if defined(VK_EXT_shader_64bit_indexing) } else if (strcmp("VK_EXT_shader_64bit_indexing", properties.extensionName) == 0) { device->shader_64b_indexing = true; #endif } } vk::PhysicalDeviceProperties2 props2; vk::PhysicalDeviceMaintenance3Properties props3; vk::PhysicalDeviceMaintenance4Properties props4; vk::PhysicalDeviceSubgroupProperties subgroup_props; vk::PhysicalDeviceDriverProperties driver_props; vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props; vk::PhysicalDeviceVulkan11Properties vk11_props; vk::PhysicalDeviceVulkan12Properties vk12_props; vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR shader_integer_dot_product_props; vk::PhysicalDeviceExternalMemoryHostPropertiesEXT external_memory_host_props; props2.pNext = &props3; props3.pNext = &subgroup_props; subgroup_props.pNext = &driver_props; driver_props.pNext = &vk11_props; vk11_props.pNext = &vk12_props; VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_props; if (maintenance4_support) { last_struct->pNext = (VkBaseOutStructure *)&props4; last_struct = (VkBaseOutStructure *)&props4; } if (sm_builtins) { last_struct->pNext = (VkBaseOutStructure *)&sm_props; last_struct = (VkBaseOutStructure *)&sm_props; } if (amd_shader_core_properties2) { last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props; last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props; } if (device->subgroup_size_control) { last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_props; last_struct = (VkBaseOutStructure *)&subgroup_size_control_props; } #if defined(VK_NV_cooperative_matrix2) vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props; if (coopmat2_support) { last_struct->pNext = (VkBaseOutStructure *)&coopmat2_props; last_struct = (VkBaseOutStructure *)&coopmat2_props; } #endif if (device->integer_dot_product) { last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_props; last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_props; } if (device->external_memory_host) { last_struct->pNext = (VkBaseOutStructure *)&external_memory_host_props; last_struct = (VkBaseOutStructure *)&external_memory_host_props; } device->physical_device.getProperties2(&props2); device->properties = props2.properties; device->vendor_id = device->properties.vendorID; device->driver_id = driver_props.driverID; if (device->driver_id == vk::DriverId::eMoltenvk) { // Disable external_memory_host until https://github.com/KhronosGroup/MoltenVK/pull/2622 // is available in the Vulkan SDK. device->external_memory_host = false; } // Implementing the async backend interfaces seems broken on older Intel HW, // see https://github.com/ggml-org/llama.cpp/issues/17302. device->support_async = (device->vendor_id != VK_VENDOR_ID_INTEL || std::string(device->properties.deviceName.data()).find("(DG1)") == std::string::npos) && getenv("GGML_VK_DISABLE_ASYNC") == nullptr; if (!device->support_async) { GGML_LOG_DEBUG("ggml_vulkan: WARNING: Async execution disabled on certain Intel devices.\n"); } const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE"); if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) { device->max_memory_allocation_size = std::stoull(GGML_VK_FORCE_MAX_ALLOCATION_SIZE); } else if (maintenance4_support) { device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize); } else { device->max_memory_allocation_size = props3.maxMemoryAllocationSize; } const char* GGML_VK_FORCE_MAX_BUFFER_SIZE = getenv("GGML_VK_FORCE_MAX_BUFFER_SIZE"); if (GGML_VK_FORCE_MAX_BUFFER_SIZE != nullptr) { device->max_buffer_size = std::stoull(GGML_VK_FORCE_MAX_BUFFER_SIZE); } else if (maintenance4_support) { device->max_buffer_size = props4.maxBufferSize; } else { device->max_buffer_size = device->max_memory_allocation_size; } const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE"); if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) { device->suballocation_block_size = std::stoull(GGML_VK_SUBALLOCATION_BLOCK_SIZE); } else { // Limit batching of allocations to 1GB by default to avoid fragmentation issues device->suballocation_block_size = 1024*1024*1024; } device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size); device->subgroup_size = subgroup_props.subgroupSize; device->subgroup_size_log2 = uint32_t(log2f(float(device->subgroup_size))); device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; if (sm_builtins) { device->shader_core_count = sm_props.shaderSMCount; } else if (amd_shader_core_properties2) { device->shader_core_count = amd_shader_core_properties2_props.activeComputeUnitCount; } else if (device->vendor_id == VK_VENDOR_ID_INTEL) { device->shader_core_count = ggml_vk_intel_shader_core_count(device->physical_device); } else { device->shader_core_count = 0; } device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16; device->subgroup_basic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eBasic); device->subgroup_arithmetic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic); #ifdef __APPLE__ // Workaround for subgroup arithmetic failing on MoltenVK with AMD GPUs (issue 15846) if (device->vendor_id == VK_VENDOR_ID_AMD) { device->subgroup_arithmetic = false; } #endif device->subgroup_shuffle = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eShuffle); device->subgroup_clustered = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eClustered); device->subgroup_ballot = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eBallot); device->subgroup_vote = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eVote); const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr; device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props, device->architecture)) { device->coopmat_support = false; } device->integer_dot_product = device->integer_dot_product && shader_integer_dot_product_props.integerDotProduct4x8BitPackedSignedAccelerated; device->min_imported_host_pointer_alignment = external_memory_host_props.minImportedHostPointerAlignment; device->max_workgroup_size_log2 = uint32_t(log2f(float(device->properties.limits.maxComputeWorkGroupInvocations))); std::vector queue_family_props = device->physical_device.getQueueFamilyProperties(); // Try to find a non-graphics compute queue and transfer-focused queues // Allow overriding avoiding the graphics queue because it can increase performance on RADV const bool allow_graphics_queue = (getenv("GGML_VK_ALLOW_GRAPHICS_QUEUE") != nullptr); const vk::QueueFlagBits graphics_flag = allow_graphics_queue ? (vk::QueueFlagBits)0 : vk::QueueFlagBits::eGraphics; const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, graphics_flag, -1, 1); const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | graphics_flag, compute_queue_family_index, 1); const float priorities[] = { 1.0f, 1.0f }; device->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1; std::vector device_queue_create_infos; if (compute_queue_family_index != transfer_queue_family_index) { device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, priorities + 1}); } else if(!device->single_queue) { device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities}); } else { device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); } vk::DeviceCreateInfo device_create_info{}; std::vector device_extensions; vk::PhysicalDeviceFeatures device_features = device->physical_device.getFeatures(); VkPhysicalDeviceFeatures2 device_features2; device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; device_features2.pNext = nullptr; device_features2.features = (VkPhysicalDeviceFeatures)device_features; VkPhysicalDeviceVulkan11Features vk11_features; vk11_features.pNext = nullptr; vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES; device_features2.pNext = &vk11_features; VkPhysicalDeviceVulkan12Features vk12_features; vk12_features.pNext = nullptr; vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; vk11_features.pNext = &vk12_features; last_struct = (VkBaseOutStructure *)&vk12_features; VkPhysicalDevicePipelineRobustnessFeaturesEXT pl_robustness_features; pl_robustness_features.pNext = nullptr; pl_robustness_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_FEATURES_EXT; pl_robustness_features.pipelineRobustness = VK_FALSE; if (pipeline_robustness) { last_struct->pNext = (VkBaseOutStructure *)&pl_robustness_features; last_struct = (VkBaseOutStructure *)&pl_robustness_features; device_extensions.push_back("VK_EXT_pipeline_robustness"); } VkPhysicalDeviceMemoryPriorityFeaturesEXT memory_priority_features; memory_priority_features.pNext = nullptr; memory_priority_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PRIORITY_FEATURES_EXT; memory_priority_features.memoryPriority = VK_FALSE; if (device->memory_priority) { last_struct->pNext = (VkBaseOutStructure *)&memory_priority_features; last_struct = (VkBaseOutStructure *)&memory_priority_features; device_extensions.push_back("VK_EXT_memory_priority"); } VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features; subgroup_size_control_features.pNext = nullptr; subgroup_size_control_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT; subgroup_size_control_features.computeFullSubgroups = false; subgroup_size_control_features.subgroupSizeControl = false; if (device->subgroup_size_control) { last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_features; last_struct = (VkBaseOutStructure *)&subgroup_size_control_features; } #if defined(VK_KHR_cooperative_matrix) VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features; coopmat_features.pNext = nullptr; coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; coopmat_features.cooperativeMatrix = VK_FALSE; if (device->coopmat_support) { last_struct->pNext = (VkBaseOutStructure *)&coopmat_features; last_struct = (VkBaseOutStructure *)&coopmat_features; } #endif #if defined(VK_NV_cooperative_matrix2) VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {}; coopmat2_features.pNext = nullptr; coopmat2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV; if (coopmat2_support) { last_struct->pNext = (VkBaseOutStructure *)&coopmat2_features; last_struct = (VkBaseOutStructure *)&coopmat2_features; device_extensions.push_back("VK_NV_cooperative_matrix2"); } #endif #if defined(VK_KHR_shader_bfloat16) VkPhysicalDeviceShaderBfloat16FeaturesKHR bfloat16_features {}; bfloat16_features.pNext = nullptr; bfloat16_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR; if (bfloat16_support) { last_struct->pNext = (VkBaseOutStructure *)&bfloat16_features; last_struct = (VkBaseOutStructure *)&bfloat16_features; device_extensions.push_back("VK_KHR_shader_bfloat16"); } #endif VkPhysicalDeviceMaintenance4Features maint4_features {}; maint4_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES; if (maintenance4_support) { last_struct->pNext = (VkBaseOutStructure *)&maint4_features; last_struct = (VkBaseOutStructure *)&maint4_features; device_extensions.push_back("VK_KHR_maintenance4"); } VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR shader_integer_dot_product_features {}; shader_integer_dot_product_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR; if (device->integer_dot_product) { last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_features; last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_features; device_extensions.push_back("VK_KHR_shader_integer_dot_product"); } VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR pep_features {}; pep_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR; if (pipeline_executable_properties_support) { last_struct->pNext = (VkBaseOutStructure *)&pep_features; last_struct = (VkBaseOutStructure *)&pep_features; device_extensions.push_back("VK_KHR_pipeline_executable_properties"); } if (device->external_memory_host) { device_extensions.push_back("VK_EXT_external_memory_host"); } #if defined(VK_EXT_shader_64bit_indexing) VkPhysicalDeviceShader64BitIndexingFeaturesEXT shader_64bit_indexing_features {}; shader_64bit_indexing_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_64_BIT_INDEXING_FEATURES_EXT; if (device->shader_64b_indexing) { last_struct->pNext = (VkBaseOutStructure *)&shader_64bit_indexing_features; last_struct = (VkBaseOutStructure *)&shader_64bit_indexing_features; device_extensions.push_back("VK_EXT_shader_64bit_indexing"); } #endif vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2); device->pipeline_executable_properties_support = pipeline_executable_properties_support; device->fp16 = device->fp16 && vk12_features.shaderFloat16; #if defined(VK_KHR_shader_bfloat16) device->bf16 = bfloat16_support && bfloat16_features.shaderBFloat16Type; #else device->bf16 = false; #endif device->pipeline_robustness = pl_robustness_features.pipelineRobustness; device->multi_add = vk12_props.shaderRoundingModeRTEFloat16 && device->properties.limits.maxPushConstantsSize >= sizeof(vk_op_multi_add_push_constants) && getenv("GGML_VK_DISABLE_MULTI_ADD") == nullptr; device->shader_int64 = device_features2.features.shaderInt64; device->buffer_device_address = vk12_features.bufferDeviceAddress; device->vulkan_memory_model = vk12_features.vulkanMemoryModel; if (device->subgroup_size_control) { device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize; device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize; device_extensions.push_back("VK_EXT_subgroup_size_control"); } device->subgroup_size_control = device->subgroup_size_control && (subgroup_size_control_props.requiredSubgroupSizeStages & vk::ShaderStageFlagBits::eCompute) && subgroup_size_control_features.subgroupSizeControl; device->subgroup_require_full_support = subgroup_size_control_features.computeFullSubgroups; #if defined(VK_KHR_cooperative_matrix) device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix; device->coopmat1_fa_support = device->coopmat_support && device->subgroup_require_full_support; #endif if (coopmat2_support) { #if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) if (coopmat2_features.cooperativeMatrixWorkgroupScope && coopmat2_features.cooperativeMatrixFlexibleDimensions && coopmat2_features.cooperativeMatrixReductions && coopmat2_features.cooperativeMatrixConversions && coopmat2_features.cooperativeMatrixPerElementOperations && coopmat2_features.cooperativeMatrixTensorAddressing && coopmat2_features.cooperativeMatrixBlockLoads && vk12_features.bufferDeviceAddress) { std::vector flexible_dimensions; uint32_t count = 0; PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV = (PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV) vk_instance.instance.getProcAddr("vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV"); _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(device->physical_device, &count, nullptr); VkCooperativeMatrixFlexibleDimensionsPropertiesNV empty_prop {}; empty_prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_FLEXIBLE_DIMENSIONS_PROPERTIES_NV; flexible_dimensions.resize(count, empty_prop); _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(device->physical_device, &count, flexible_dimensions.data()); bool found_fp16_128 = false, found_fp16_256 = false, found_fp32_128 = false, found_fp32_256 = false; // need to support fp16*fp16 with fp16/fp32 accumulator, for workgroupsize 128 // with 32x16x16 and 256 with 32x32x16. for (auto &prop : flexible_dimensions) { if (prop.saturatingAccumulation == VK_FALSE && prop.scope == VK_SCOPE_WORKGROUP_KHR && prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) { if (prop.workgroupInvocations == 128 && prop.MGranularity <= 32 && prop.NGranularity <= 16 && prop.KGranularity <= 16) { if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { found_fp16_128 = true; } if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { found_fp32_128 = true; } } if (prop.workgroupInvocations == 256 && prop.MGranularity <= 32 && prop.NGranularity <= 32 && prop.KGranularity <= 16) { if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { found_fp16_256 = true; } if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { found_fp32_256 = true; } } } } if (found_fp16_128 && found_fp16_256 && found_fp32_128 && found_fp32_256 && coopmat2_props.cooperativeMatrixFlexibleDimensionsMaxDimension >= 512) { device->coopmat2 = true; } } #endif } if (!vk11_features.storageBuffer16BitAccess) { std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl; throw std::runtime_error("Unsupported device"); } device_extensions.push_back("VK_KHR_16bit_storage"); #ifdef GGML_VULKAN_VALIDATE device_extensions.push_back("VK_KHR_shader_non_semantic_info"); #endif if (device->fp16) { device_extensions.push_back("VK_KHR_shader_float16_int8"); } #if defined(VK_KHR_cooperative_matrix) if (device->coopmat_support) { // Query supported shapes std::vector cm_props; PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)vkGetInstanceProcAddr(vk_instance.instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR"); uint32_t cm_props_num; pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(device->physical_device, &cm_props_num, nullptr); cm_props.resize(cm_props_num); for (auto& prop : cm_props) { prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR; } pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(device->physical_device, &cm_props_num, cm_props.data()); VK_LOG_DEBUG("ggml_vulkan: Cooperative Matrix Shapes: " << cm_props.size()); for (auto& prop : cm_props) { VK_LOG_DEBUG("ggml_vulkan: M: " << prop.MSize << " N: " << prop.NSize << " K: " << prop.KSize << " A: " << vk::to_string((vk::ComponentTypeKHR)prop.AType) << " B: " << vk::to_string((vk::ComponentTypeKHR)prop.BType) << " C: " << vk::to_string((vk::ComponentTypeKHR)prop.CType) << " Result: " << vk::to_string((vk::ComponentTypeKHR)prop.ResultType) << " saturatingAccumulation: " << prop.saturatingAccumulation << " scope: " << vk::to_string((vk::ScopeKHR)prop.scope)); if ((vk::ComponentTypeKHR)prop.AType == vk::ComponentTypeKHR::eFloat16 && (vk::ComponentTypeKHR)prop.BType == vk::ComponentTypeKHR::eFloat16 && (vk::ScopeKHR)prop.scope == vk::ScopeKHR::eSubgroup ) { if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat32 && (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat32) { // coopmat sizes not set yet if (device->coopmat_m == 0) { device->coopmat_acc_f32_support = true; device->coopmat_m = prop.MSize; device->coopmat_n = prop.NSize; device->coopmat_k = prop.KSize; } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) { // Only enable if shape is identical device->coopmat_acc_f32_support = true; } if (prop.MSize == 16 && prop.NSize == 16 && prop.KSize == 16) { device->coopmat_support_16x16x16_f32acc = true; } } else if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat16 && (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat16) { // coopmat sizes not set yet if (device->coopmat_m == 0) { device->coopmat_acc_f16_support = true; device->coopmat_m = prop.MSize; device->coopmat_n = prop.NSize; device->coopmat_k = prop.KSize; } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) { // Only enable if shape is identical device->coopmat_acc_f16_support = true; } if (prop.MSize == 16 && prop.NSize == 16 && prop.KSize == 16) { device->coopmat_support_16x16x16_f16acc = true; } } } else if ((vk::ComponentTypeKHR)prop.AType == vk::ComponentTypeKHR::eSint8 && (vk::ComponentTypeKHR)prop.BType == vk::ComponentTypeKHR::eSint8 && (vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eSint32 && (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eSint32 && (vk::ScopeKHR)prop.scope == vk::ScopeKHR::eSubgroup && device->coopmat_int_m == 0 ) { device->coopmat_int_support = true; device->coopmat_int_m = prop.MSize; device->coopmat_int_n = prop.NSize; device->coopmat_int_k = prop.KSize; } #if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) if (prop.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && prop.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR && prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR && (vk::ScopeKHR)prop.scope == vk::ScopeKHR::eSubgroup ) { // coopmat sizes not set yet if (device->coopmat_m == 0) { device->coopmat_bf16_support = true; device->coopmat_m = prop.MSize; device->coopmat_n = prop.NSize; device->coopmat_k = prop.KSize; } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) { // Only enable if shape is identical device->coopmat_bf16_support = true; } } #endif } if (device->coopmat_m == 0 || !device->coopmat_acc_f32_support) { // No suitable matmul mode found GGML_LOG_DEBUG("ggml_vulkan: WARNING: No suitable matrix core mode found. Disabling matrix cores.\n"); device->coopmat_support = false; } if (getenv("GGML_VK_DISABLE_BFLOAT16")) { device->coopmat_bf16_support = false; } } if (device->coopmat_support) { device_extensions.push_back("VK_KHR_cooperative_matrix"); } #if defined(VK_KHR_shader_bfloat16) if (device->coopmat_bf16_support) { device_extensions.push_back("VK_KHR_shader_bfloat16"); } #endif #endif device->name = GGML_VK_NAME + std::to_string(idx); device_create_info .setFlags(vk::DeviceCreateFlags()) .setQueueCreateInfos(device_queue_create_infos) .setPEnabledExtensionNames(device_extensions); device_create_info.setPNext(&device_features2); device->device = device->physical_device.createDevice(device_create_info); // Queues ggml_vk_create_queue(device, device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }, false); // Shaders // Disable matmul tile sizes early if performance low or not supported for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) { switch (device->vendor_id) { #ifndef GGML_VULKAN_RUN_TESTS case VK_VENDOR_ID_AMD: device->mul_mat_l[i] = device->coopmat_support && device->driver_id != vk::DriverId::eAmdProprietary; device->mul_mat_m[i] = true; device->mul_mat_s[i] = true; device->mul_mat_id_l[i] = false; device->mul_mat_id_m[i] = true; device->mul_mat_id_s[i] = true; break; case VK_VENDOR_ID_INTEL: if (!device->coopmat_support || device->architecture != INTEL_XE2) { device->mul_mat_l[i] = false; device->mul_mat_id_l[i] = false; } else { device->mul_mat_l[i] = true; // if coopmat & XE2+, allow large matmul warptile config for Intel device->mul_mat_id_l[i] = true; } device->mul_mat_m[i] = true; device->mul_mat_s[i] = true; device->mul_mat_id_m[i] = true; device->mul_mat_id_s[i] = true; break; case VK_VENDOR_ID_APPLE: device->mul_mat_l[i] = false; device->mul_mat_m[i] = true; device->mul_mat_s[i] = false; device->mul_mat_id_l[i] = false; device->mul_mat_id_m[i] = true; device->mul_mat_id_s[i] = false; break; #endif default: device->mul_mat_l[i] = true; device->mul_mat_m[i] = true; device->mul_mat_s[i] = true; device->mul_mat_id_l[i] = true; device->mul_mat_id_m[i] = true; device->mul_mat_id_s[i] = true; break; } } std::vector dsl_binding; std::vector dsl_binding_flags; for (uint32_t i = 0; i < MAX_PARAMETER_COUNT; i++) { dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute}); dsl_binding_flags.push_back({}); } vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags }; vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info( {}, dsl_binding); descriptor_set_layout_create_info.setPNext(&dslbfci); device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info); ggml_vk_load_shaders(device); // Only use transfer queue on AMD non-GCN, when the graphics queue is not enabled const bool prefers_transfer_queue = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN && !allow_graphics_queue; if (!device->single_queue) { const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0; ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true); device->async_use_transfer_queue = prefers_transfer_queue || (getenv("GGML_VK_ASYNC_USE_TRANSFER_QUEUE") != nullptr); } else { // TODO: Use pointer or reference to avoid copy device->transfer_queue.copyFrom(device->compute_queue); device->transfer_queue.cmd_pool.init(device, &device->transfer_queue); device->async_use_transfer_queue = false; } device->buffer_type = { /* .iface = */ ggml_backend_vk_buffer_type_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), idx), /* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device }, }; device->fence = device->device.createFence({}); device->idx = idx; device->disable_fusion = getenv("GGML_VK_DISABLE_FUSION") != nullptr; device->add_rms_fusion = !device->disable_fusion && device->subgroup_arithmetic && device->vendor_id != VK_VENDOR_ID_INTEL; device->partials_binding_alignment = std::max(4u, (uint32_t)device->properties.limits.minStorageBufferOffsetAlignment); device->mmvq_mode = 0; if (getenv("GGML_VK_DISABLE_MMVQ")) { device->mmvq_mode = -1; } else if (getenv("GGML_VK_FORCE_MMVQ")) { device->mmvq_mode = 1; } return device; } return vk_instance.devices[idx]; } static void ggml_vk_print_gpu_info(size_t idx) { GGML_ASSERT(idx < vk_instance.device_indices.size()); size_t dev_num = vk_instance.device_indices[idx]; VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")"); GGML_ASSERT(vk_instance_initialized); std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); if (dev_num >= devices.size()) { std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl; throw std::runtime_error("Device not found"); } vk::PhysicalDevice physical_device = devices[dev_num]; std::vector ext_props = physical_device.enumerateDeviceExtensionProperties(); bool fp16_storage = false; bool fp16_compute = false; bool coopmat_support = false; bool coopmat2_support = false; bool integer_dot_product = false; bool bfloat16_support = false; for (auto properties : ext_props) { if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { fp16_storage = true; } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { fp16_compute = true; #if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_COOPMAT")) { coopmat_support = true; #endif #if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_COOPMAT2")) { coopmat2_support = true; #endif #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) { integer_dot_product = true; #endif #if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) } else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_BFLOAT16")) { bfloat16_support = true; #endif } } const vk_device_architecture device_architecture = get_device_architecture(physical_device); const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16"); bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr; bool fp16 = !force_disable_f16 && fp16_storage && fp16_compute; vk::PhysicalDeviceProperties2 props2; vk::PhysicalDeviceMaintenance3Properties props3; vk::PhysicalDeviceSubgroupProperties subgroup_props; vk::PhysicalDeviceDriverProperties driver_props; vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR shader_integer_dot_product_props; props2.pNext = &props3; props3.pNext = &subgroup_props; subgroup_props.pNext = &driver_props; // Pointer to the last chain element VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&driver_props; if (integer_dot_product) { last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_props; last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_props; } physical_device.getProperties2(&props2); VkPhysicalDeviceFeatures2 device_features2; device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; device_features2.pNext = nullptr; VkPhysicalDeviceVulkan11Features vk11_features; vk11_features.pNext = nullptr; vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES; device_features2.pNext = &vk11_features; VkPhysicalDeviceVulkan12Features vk12_features; vk12_features.pNext = nullptr; vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; vk11_features.pNext = &vk12_features; // Pointer to the last chain element last_struct = (VkBaseOutStructure *)&vk12_features; #if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features; coopmat_features.pNext = nullptr; coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; coopmat_features.cooperativeMatrix = VK_FALSE; if (coopmat_support) { last_struct->pNext = (VkBaseOutStructure *)&coopmat_features; last_struct = (VkBaseOutStructure *)&coopmat_features; } #endif VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR shader_integer_dot_product_features {}; shader_integer_dot_product_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR; if (integer_dot_product) { last_struct->pNext = (VkBaseOutStructure *)&shader_integer_dot_product_features; last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_features; } #if defined(VK_KHR_shader_bfloat16) VkPhysicalDeviceShaderBfloat16FeaturesKHR bfloat16_features {}; bfloat16_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR; if (bfloat16_support) { last_struct->pNext = (VkBaseOutStructure *)&bfloat16_features; last_struct = (VkBaseOutStructure *)&bfloat16_features; } #endif vkGetPhysicalDeviceFeatures2(physical_device, &device_features2); fp16 = fp16 && vk12_features.shaderFloat16; #if defined(VK_KHR_shader_bfloat16) bool bf16 = bfloat16_support && bfloat16_features.shaderBFloat16Type; #else bool bf16 = false; #endif uint32_t default_subgroup_size = get_subgroup_size("", device_architecture); const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize; const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; integer_dot_product = integer_dot_product && shader_integer_dot_product_props.integerDotProduct4x8BitPackedSignedAccelerated && shader_integer_dot_product_features.shaderIntegerDotProduct; coopmat_support = coopmat_support #if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) && coopmat_features.cooperativeMatrix #endif && ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture); std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none"; std::string device_name = props2.properties.deviceName.data(); GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | bf16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n", idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, bf16, subgroup_size, props2.properties.limits.maxComputeSharedMemorySize, integer_dot_product, matrix_cores.c_str()); if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) { GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n"); } } static bool ggml_vk_instance_layer_settings_available(); static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions); static bool ggml_vk_instance_debug_utils_ext_available(const std::vector & instance_extensions); static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev); static DispatchLoaderDynamic ggml_vk_default_dispatcher_instance; DispatchLoaderDynamic & ggml_vk_default_dispatcher() { return ggml_vk_default_dispatcher_instance; } static void ggml_vk_instance_init() { if (vk_instance_initialized) { return; } VK_LOG_DEBUG("ggml_vk_instance_init()"); // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers- ggml_vk_default_dispatcher_instance.init(vkGetInstanceProcAddr); uint32_t api_version = vk::enumerateInstanceVersion(); if (api_version < VK_API_VERSION_1_2) { std::cerr << "ggml_vulkan: Error: Vulkan 1.2 required." << std::endl; throw vk::SystemError(vk::Result::eErrorFeatureNotPresent, "Vulkan 1.2 required"); } vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, api_version }; const std::vector instance_extensions = vk::enumerateInstanceExtensionProperties(); const bool layer_settings = ggml_vk_instance_layer_settings_available(); #ifdef __APPLE__ const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions); #endif const bool debug_utils_ext = ggml_vk_instance_debug_utils_ext_available(instance_extensions) && getenv("GGML_VK_DEBUG_MARKERS") != nullptr; std::vector layers; if (layer_settings) { layers.push_back("VK_LAYER_KHRONOS_validation"); } std::vector extensions; if (layer_settings) { extensions.push_back("VK_EXT_layer_settings"); } #ifdef __APPLE__ if (portability_enumeration_ext) { extensions.push_back("VK_KHR_portability_enumeration"); } #endif if (debug_utils_ext) { extensions.push_back("VK_EXT_debug_utils"); } VkBool32 enable_best_practice = layer_settings; std::vector settings = { { "VK_LAYER_KHRONOS_validation", "validate_best_practices", vk::LayerSettingTypeEXT::eBool32, 1, &enable_best_practice }, }; vk::LayerSettingsCreateInfoEXT layer_setting_info(settings); vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions, &layer_setting_info); #ifdef __APPLE__ if (portability_enumeration_ext) { instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR; } #endif vk_instance.instance = vk::createInstance(instance_create_info); vk_instance_initialized = true; if (debug_utils_ext) { vk_instance.debug_utils_support = true; vk_instance.pfn_vkSetDebugUtilsObjectNameEXT = (PFN_vkSetDebugUtilsObjectNameEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkSetDebugUtilsObjectNameEXT"); vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT = (PFN_vkQueueBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueBeginDebugUtilsLabelEXT"); vk_instance.pfn_vkQueueEndDebugUtilsLabelEXT = (PFN_vkQueueEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueEndDebugUtilsLabelEXT"); vk_instance.pfn_vkCmdBeginDebugUtilsLabelEXT = (PFN_vkCmdBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdBeginDebugUtilsLabelEXT"); vk_instance.pfn_vkCmdEndDebugUtilsLabelEXT = (PFN_vkCmdEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdEndDebugUtilsLabelEXT"); vk_instance.pfn_vkCmdInsertDebugUtilsLabelEXT = (PFN_vkCmdInsertDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdInsertDebugUtilsLabelEXT"); } vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr; vk_perf_logger_concurrent = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr; vk_enable_sync_logger = getenv("GGML_VK_SYNC_LOGGER") != nullptr; vk_memory_logger_enabled = getenv("GGML_VK_MEMORY_LOGGER") != nullptr; const char* GGML_VK_PIPELINE_STATS = getenv("GGML_VK_PIPELINE_STATS"); if (GGML_VK_PIPELINE_STATS != nullptr) { vk_pipeline_stats_filter = GGML_VK_PIPELINE_STATS; } const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY"); if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) { vk_perf_logger_frequency = std::stoul(GGML_VK_PERF_LOGGER_FREQUENCY); } // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers- VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance); std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES"); if (devices_env != nullptr) { size_t num_available_devices = devices.size(); std::string devices(devices_env); std::replace(devices.begin(), devices.end(), ',', ' '); std::stringstream ss(devices); size_t tmp; while (ss >> tmp) { if(tmp >= num_available_devices) { std::cerr << "ggml_vulkan: Invalid device index " << tmp << " in GGML_VK_VISIBLE_DEVICES." << std::endl; throw std::runtime_error("Invalid Vulkan device index"); } vk_instance.device_indices.push_back(tmp); } } else { // If no vulkan devices are found, return early if (devices.empty()) { GGML_LOG_INFO("ggml_vulkan: No devices found.\n"); return; } // Default to using all dedicated GPUs for (size_t i = 0; i < devices.size(); i++) { vk::PhysicalDeviceProperties2 new_props; vk::PhysicalDeviceDriverProperties new_driver; vk::PhysicalDeviceIDProperties new_id; new_props.pNext = &new_driver; new_driver.pNext = &new_id; devices[i].getProperties2(&new_props); if ((new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) && ggml_vk_device_is_supported(devices[i])) { // Check if there are two physical devices corresponding to the same GPU // This handles the case where the same GPU appears with different drivers (e.g., RADV + AMDVLK on Linux), // see https://github.com/ggml-org/llama.cpp/pull/7582 for original deduplication. // MoltenVK on macOS may report the same UUID for distinct GPUs on multi-GPU cards, // see https://github.com/KhronosGroup/MoltenVK/issues/2683. Skip when both old/new // driver is MoltenVK auto old_device = std::find_if( vk_instance.device_indices.begin(), vk_instance.device_indices.end(), [&devices, &new_id, &new_driver](const size_t k){ vk::PhysicalDeviceProperties2 old_props; vk::PhysicalDeviceDriverProperties old_driver; vk::PhysicalDeviceIDProperties old_id; old_props.pNext = &old_driver; old_driver.pNext = &old_id; devices[k].getProperties2(&old_props); bool same_uuid = std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID)); same_uuid = same_uuid || ( old_id.deviceLUIDValid && new_id.deviceLUIDValid && std::equal(std::begin(old_id.deviceLUID), std::end(old_id.deviceLUID), std::begin(new_id.deviceLUID)) ); bool both_molten_vk = (new_driver.driverID == vk::DriverId::eMoltenvk && old_driver.driverID == vk::DriverId::eMoltenvk); return same_uuid && !both_molten_vk; } ); if (old_device == vk_instance.device_indices.end()) { vk_instance.device_indices.push_back(i); } else { // There can be two physical devices corresponding to the same GPU if there are 2 different drivers // This can cause error when splitting layers aross the devices, need to keep only 1 VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same deviceUUID"); vk::PhysicalDeviceProperties2 old_props; vk::PhysicalDeviceDriverProperties old_driver; old_props.pNext = &old_driver; devices[*old_device].getProperties2(&old_props); std::map driver_priorities {}; int old_priority = std::numeric_limits::max(); int new_priority = std::numeric_limits::max(); // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id // Smaller number -> higher priority switch (old_props.properties.vendorID) { case VK_VENDOR_ID_AMD: driver_priorities[vk::DriverId::eMesaRadv] = 1; driver_priorities[vk::DriverId::eAmdOpenSource] = 2; driver_priorities[vk::DriverId::eAmdProprietary] = 3; break; case VK_VENDOR_ID_INTEL: driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1; driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2; break; case VK_VENDOR_ID_NVIDIA: driver_priorities[vk::DriverId::eNvidiaProprietary] = 1; #if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235 driver_priorities[vk::DriverId::eMesaNvk] = 2; #endif break; case VK_VENDOR_ID_QUALCOMM: driver_priorities[vk::DriverId::eQualcommProprietary] = 1; driver_priorities[vk::DriverId::eMesaTurnip] = 2; break; } driver_priorities[vk::DriverId::eMesaDozen] = 100; if (driver_priorities.count(old_driver.driverID)) { old_priority = driver_priorities[old_driver.driverID]; } if (driver_priorities.count(new_driver.driverID)) { new_priority = driver_priorities[new_driver.driverID]; } if (new_priority < old_priority) { auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device); vk_instance.device_indices.erase(r, vk_instance.device_indices.end()); vk_instance.device_indices.push_back(i); VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName); } else { VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl); } } } } // If no GPUs found, fall back to the first non-CPU device. // If only CPU devices are available, return without devices. if (vk_instance.device_indices.empty()) { for (size_t i = 0; i < devices.size(); i++) { if (devices[i].getProperties().deviceType != vk::PhysicalDeviceType::eCpu) { vk_instance.device_indices.push_back(i); break; } } } if (vk_instance.device_indices.empty()) { GGML_LOG_INFO("ggml_vulkan: No devices found.\n"); return; } } GGML_LOG_DEBUG("ggml_vulkan: Found %zu Vulkan devices:\n", vk_instance.device_indices.size()); for (size_t i = 0; i < vk_instance.device_indices.size(); i++) { vk::PhysicalDevice vkdev = devices[vk_instance.device_indices[i]]; std::vector extensionprops = vkdev.enumerateDeviceExtensionProperties(); bool membudget_supported = false; for (const auto & ext : extensionprops) { if (strcmp(VK_EXT_MEMORY_BUDGET_EXTENSION_NAME, ext.extensionName) == 0) { membudget_supported = true; break; } } vk_instance.device_supports_membudget.push_back(membudget_supported); ggml_vk_print_gpu_info(i); } } static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")"); ggml_vk_instance_init(); GGML_ASSERT(idx < vk_instance.device_indices.size()); ctx->name = GGML_VK_NAME + std::to_string(idx); ctx->device = ggml_vk_get_device(idx); ctx->semaphore_idx = 0; ctx->event_idx = 0; ctx->prealloc_size_x = 0; ctx->prealloc_size_y = 0; ctx->prealloc_size_split_k = 0; // Fixed size of 1KB, for deterministic behavior ctx->prealloc_size_add_rms_partials = 1024; ctx->fence = ctx->device->device.createFence({}); ctx->almost_ready_fence = ctx->device->device.createFence({}); ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue); if (ctx->device->async_use_transfer_queue) { vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 }; vk::SemaphoreCreateInfo ci{}; ci.setPNext(&tci); ctx->transfer_semaphore.s = ctx->device->device.createSemaphore(ci); ctx->transfer_semaphore.value = 0; ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue); } if (vk_perf_logger_enabled) { ctx->perf_logger = std::unique_ptr(new vk_perf_logger()); } #ifdef GGML_VULKAN_CHECK_RESULTS const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS"); vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks)); const char* output_tensor = getenv("GGML_VULKAN_OUTPUT_TENSOR"); vk_output_tensor = (output_tensor == NULL ? 0 : atoi(output_tensor)); #endif } static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) { VK_LOG_DEBUG("ggml_vk_get_to_fp16()"); switch (type) { case GGML_TYPE_F32: case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_NL: case GGML_TYPE_MXFP4: case GGML_TYPE_NVFP4: break; default: return nullptr; } return ctx->device->pipeline_dequant[type]; } static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) { VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ", " << prec << ")"); if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_f32; } if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { return ctx->device->pipeline_matmul_f32_f16; } if (src0_type == GGML_TYPE_BF16 && src1_type == GGML_TYPE_BF16) { return ctx->device->pipeline_matmul_bf16; } if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) { if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_f16_f32.f16acc; } if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { return ctx->device->pipeline_matmul_f16.f16acc; } } else { if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_f16_f32.f32acc; } if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { return ctx->device->pipeline_matmul_f16.f32acc; } } // MMQ if (src1_type == GGML_TYPE_Q8_1) { vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f32acc; if (pipelines->is_empty()) { return nullptr; } return pipelines; } if (src1_type != GGML_TYPE_F32 && !ctx->device->coopmat2) { return nullptr; } switch (src0_type) { case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_NL: case GGML_TYPE_MXFP4: case GGML_TYPE_NVFP4: break; default: return nullptr; } if (ctx->device->coopmat2) { assert(src1_type == GGML_TYPE_F16); return prec == GGML_PREC_DEFAULT ? ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f32acc; } if (ctx->device->coopmat_support) { return (ctx->device->fp16 && ctx->device->coopmat_acc_f16_support && prec == GGML_PREC_DEFAULT) ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; } return (ctx->device->fp16 && prec == GGML_PREC_DEFAULT) ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; } static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t num_cols, uint32_t m, uint32_t k) { VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()"); GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16 || b_type == GGML_TYPE_Q8_1); GGML_ASSERT(num_cols >= 1 && num_cols <= mul_mat_vec_max_cols); if (b_type == GGML_TYPE_Q8_1) { switch (a_type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_MXFP4: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ1_M: break; default: return nullptr; } } switch (a_type) { case GGML_TYPE_F32: case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_NL: case GGML_TYPE_MXFP4: case GGML_TYPE_NVFP4: break; default: return nullptr; } // heuristic to choose workgroup size uint32_t dmmv_wg = DMMV_WG_SIZE_SUBGROUP; if ((ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA && ctx->device->architecture != vk_device_architecture::NVIDIA_PRE_TURING) || ctx->device->vendor_id == VK_VENDOR_ID_INTEL) { // Prefer larger workgroups when M is small, to spread the work out more // and keep more SMs busy. // q6_k seems to prefer small workgroup size even for "medium" values of M. if (a_type == GGML_TYPE_Q6_K) { if (m < 4096 && k >= 1024) { dmmv_wg = DMMV_WG_SIZE_LARGE; } } else { if (m <= 8192 && k >= 1024) { dmmv_wg = DMMV_WG_SIZE_LARGE; } } } if (b_type == GGML_TYPE_Q8_1) { if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL) { dmmv_wg = DMMV_WG_SIZE_SUBGROUP; } return ctx->device->pipeline_dequant_mul_mat_vec_q8_1_f32[dmmv_wg][a_type][num_cols-1]; } return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[dmmv_wg][a_type][num_cols-1] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[dmmv_wg][a_type][num_cols-1]; } static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) { VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()"); if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_id_f32; } if (src0_type == GGML_TYPE_BF16 && src1_type == GGML_TYPE_BF16) { return ctx->device->pipeline_matmul_id_bf16; } if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) { if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_id_f16_f32.f16acc; } if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { return ctx->device->pipeline_matmul_id_f16.f16acc; } } else { if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_id_f16_f32.f32acc; } if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { return ctx->device->pipeline_matmul_id_f16.f32acc; } } // MMQ if (src1_type == GGML_TYPE_Q8_1) { vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_id_q8_1[src0_type].f32acc; if (pipelines->is_empty()) { return nullptr; } return pipelines; } GGML_ASSERT(src1_type == GGML_TYPE_F32 || (ctx->device->coopmat2 && src1_type == GGML_TYPE_F16)); switch (src0_type) { case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_NL: case GGML_TYPE_MXFP4: case GGML_TYPE_NVFP4: break; default: return nullptr; } vk_matmul_pipeline2& mmp = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type]; // XXX TODO 'prec' is not actually allowed in mul_mat_id. bool prefer_fp16acc = ctx->device->fp16 /*&& prec == GGML_PREC_DEFAULT*/; bool support_fp16acc = !mmp.f16acc->is_empty(); bool support_fp32acc = !mmp.f32acc->is_empty(); if (support_fp16acc && (prefer_fp16acc || !support_fp32acc)) { return mmp.f16acc; } else { GGML_ASSERT(support_fp32acc); return mmp.f32acc; } } static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t m, uint32_t k) { VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec_id()"); GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_Q8_1); if (b_type == GGML_TYPE_Q8_1) { switch (a_type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_MXFP4: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ1_M: break; default: return nullptr; } } switch (a_type) { case GGML_TYPE_F32: case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_NL: case GGML_TYPE_MXFP4: case GGML_TYPE_NVFP4: break; default: return nullptr; } // heuristic to choose workgroup size uint32_t dmmv_wg = DMMV_WG_SIZE_SUBGROUP; if ((ctx->device->vendor_id == VK_VENDOR_ID_NVIDIA && ctx->device->architecture != vk_device_architecture::NVIDIA_PRE_TURING) || ctx->device->vendor_id == VK_VENDOR_ID_INTEL) { // Prefer larger workgroups when M is small, to spread the work out more // and keep more SMs busy. // q6_k seems to prefer small workgroup size even for "medium" values of M. if (a_type == GGML_TYPE_Q6_K) { if (m < 4096 && k >= 1024) { dmmv_wg = DMMV_WG_SIZE_LARGE; } } else { if (m <= 8192 && k >= 1024) { dmmv_wg = DMMV_WG_SIZE_LARGE; } } } if (b_type == GGML_TYPE_Q8_1) { if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL) { dmmv_wg = DMMV_WG_SIZE_SUBGROUP; } return ctx->device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[dmmv_wg][a_type]; } return ctx->device->pipeline_dequant_mul_mat_vec_id_f32[dmmv_wg][a_type]; } static void * ggml_vk_host_malloc(vk_device& device, size_t size) { VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")"); vk_buffer buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) { fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n", size/1024.0/1024.0); device->device.freeMemory(buf->device_memory); device->device.destroyBuffer(buf->buffer); return nullptr; } std::lock_guard guard(device->mutex); device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf)); return buf->ptr; } static void ggml_vk_host_free(vk_device& device, void* ptr) { if (ptr == nullptr) { return; } VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")"); std::lock_guard guard(device->mutex); vk_buffer buf; size_t index; for (size_t i = 0; i < device->pinned_memory.size(); i++) { const uint8_t* addr = (const uint8_t*) std::get<0>(device->pinned_memory[i]); const uint8_t* endr = addr + std::get<1>(device->pinned_memory[i]); if (ptr >= addr && ptr < endr) { buf = std::get<2>(device->pinned_memory[i]); index = i; break; } } if (buf == nullptr) { fprintf(stderr, "WARNING: failed to free pinned memory: memory not in map\n"); return; } ggml_vk_destroy_buffer(buf); device->pinned_memory.erase(device->pinned_memory.begin() + index); } static void ggml_vk_host_get(const vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) { std::lock_guard guard(device->mutex); buf = nullptr; buf_offset = 0; for (size_t i = 0; i < device->pinned_memory.size(); i++) { const uint8_t* addr = (const uint8_t*) std::get<0>(device->pinned_memory[i]); const uint8_t* endr = addr + std::get<1>(device->pinned_memory[i]); if (ptr >= addr && ptr < endr) { buf = std::get<2>(device->pinned_memory[i]); buf_offset = ((const uint8_t *)ptr) - addr; break; } } } static vk_subbuffer ggml_vk_tensor_subbuffer( const ggml_backend_vk_context * ctx, const ggml_tensor * tensor, bool allow_misalign = false) { vk_buffer buffer = nullptr; size_t offset = 0; if (ctx->device->uma) { ggml_vk_host_get(ctx->device, tensor->data, buffer, offset); } if (!buffer) { auto buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; buffer = buf_ctx->dev_buffer; offset = vk_tensor_offset(tensor) + tensor->view_offs; } GGML_ASSERT(buffer != nullptr); size_t size = ggml_nbytes(tensor); size_t misalign_bytes = offset & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1); // The shader must support misaligned offsets when indexing into the buffer GGML_ASSERT(allow_misalign || misalign_bytes == 0); offset &= ~misalign_bytes; size += misalign_bytes; return vk_subbuffer{buffer, offset, size}; } // Get a command buffer from pool. Create a new one if no reusable buffer is available static vk_command_buffer* ggml_vk_get_or_create_cmd_buffer(vk_device& device, vk_command_pool& pool) { for (auto& cmd_buffer : pool.cmd_buffers) { if (!cmd_buffer.in_use) { cmd_buffer.use_counter++; cmd_buffer.in_use = true; return &cmd_buffer; } } return ggml_vk_create_cmd_buffer(device, pool); } static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) { vk_submission s; s.buffer = ggml_vk_get_or_create_cmd_buffer(device, p); if (one_time) { s.buffer->buf.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit }); } else { s.buffer->buf.begin({ vk::CommandBufferUsageFlags{} }); } return s; } template size_t push_constant_size(const T &t) { static_assert(std::is_class::value, "T must be a struct/class"); GGML_UNUSED(t); return sizeof(T); } template size_t push_constant_size(const std::vector &t) { GGML_UNUSED(t); return sizeof(T) * t.size(); } template size_t push_constant_size(const std::array &t) { GGML_UNUSED(t); return sizeof(T) * N; } template const T *push_constant_data(const T &t) { static_assert(std::is_class::value, "T must be a struct/class"); return &t; } template const T *push_constant_data(const std::vector &t) { return t.data(); } template const T *push_constant_data(const std::array &t) { return t.data(); } template static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list const& descriptor_buffer_infos, const T &push_constants, std::array elements) { const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]); const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]); const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {"; for (auto& buffer : descriptor_buffer_infos) { std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), "; } std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"); GGML_ASSERT(wg0 <= ctx->device->properties.limits.maxComputeWorkGroupCount[0] && wg1 <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] && wg2 <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]); GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size()); GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT); GGML_ASSERT(pipeline->parameter_count == descriptor_buffer_infos.size()); GGML_ASSERT(pipeline->push_constant_size == push_constant_size(push_constants)); vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++]; vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() }; ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {}); subctx->s->buffer->buf.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants)); subctx->s->buffer->buf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline); subctx->s->buffer->buf.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipeline->layout, 0, { descriptor_set }, {}); subctx->s->buffer->buf.dispatch(wg0, wg1, wg2); } static void ggml_vk_end_submission(vk_submission& s, std::vector wait_semaphores, std::vector signal_semaphores) { s.buffer->buf.end(); s.wait_semaphores = std::move(wait_semaphores); s.signal_semaphores = std::move(signal_semaphores); } static void ggml_vk_ctx_end(vk_context& ctx) { VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")"); if (ctx->s == nullptr) { return; } ctx->s->buffer->buf.end(); ctx->s = nullptr; } static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) { VK_LOG_DEBUG("ggml_vk_ctx_begin(" << device->name << ")"); if (subctx->s != nullptr) { ggml_vk_ctx_end(subctx); } subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) }); subctx->s = subctx->seqs[subctx->seqs.size() - 1].data(); } static vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx) { vk_context result; if (!ctx->compute_ctx.expired()) { result = ctx->compute_ctx.lock(); } else { result = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ctx->compute_ctx = result; ggml_vk_ctx_begin(ctx->device, result); } if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) { result->s->wait_semaphores.push_back(ctx->transfer_semaphore); ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value; } return result; } // Submit any pending transfer queue work and signal the transfer semaphore. // The next compute context created via ggml_vk_get_compute_ctx will wait on this semaphore. // Returns true if work was submitted. static bool ggml_vk_submit_transfer_ctx(ggml_backend_vk_context * ctx) { if (!ctx->device->async_use_transfer_queue || ctx->transfer_ctx.expired()) { return false; } vk_context cpy_ctx = ctx->transfer_ctx.lock(); ggml_vk_ctx_end(cpy_ctx); for (auto& cpy : cpy_ctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } ctx->transfer_semaphore.value++; cpy_ctx->seqs.back().back().signal_semaphores.push_back(ctx->transfer_semaphore); ggml_vk_submit(cpy_ctx, {}); ctx->transfer_ctx.reset(); return true; } static size_t ggml_vk_align_size(size_t width, size_t align) { VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")"); return CEIL_DIV(width, align) * align; } static void deferred_memcpy(void * dst, const void * src, size_t size, std::vector* memcpys = nullptr) { if (memcpys == nullptr) { memcpy(dst, src, size); } else { memcpys->emplace_back(dst, src, size); } } static void deferred_memset(void * dst, uint32_t val, size_t size, std::vector* memsets = nullptr) { if (memsets == nullptr) { memset(dst, val, size); } else { memsets->emplace_back(dst, val, size); } } static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) { if (device->sync_staging == nullptr || device->sync_staging->size < size) { VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")"); ggml_vk_destroy_buffer(device->sync_staging); device->sync_staging = ggml_vk_create_buffer_check(device, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); } } static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) { if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) { VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")"); ggml_vk_destroy_buffer(ctx->sync_staging); ctx->sync_staging = ggml_vk_create_buffer_check(ctx->device, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); } } static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")"); GGML_ASSERT(!ggml_is_contiguous(tensor)); // Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl; GGML_ABORT("fatal error"); } // Check if src is pinned memory vk_buffer buf = nullptr; size_t buf_offset = 0; ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset); const uint64_t ne0 = tensor->ne[0]; const uint64_t ne1 = tensor->ne[1]; const uint64_t ne2 = tensor->ne[2]; const uint64_t ne3 = tensor->ne[3]; const uint64_t nb0 = tensor->nb[0]; const uint64_t nb1 = tensor->nb[1]; const uint64_t nb2 = tensor->nb[2]; const uint64_t nb3 = tensor->nb[3]; const ggml_type type = tensor->type; const uint64_t ts = ggml_type_size(type); const uint64_t bs = ggml_blck_size(type); const uint64_t dstnb0 = ts; const uint64_t dstnb1 = dstnb0*(ne0/bs); const uint64_t dstnb2 = dstnb1*ne1; const uint64_t dstnb3 = dstnb2*ne2; const uint64_t ne = ggml_nelements(tensor); if (buf != nullptr) { // Memory is pinned, use as staging buffer std::vector slices; for (uint64_t i3 = 0; i3 < ne3; i3++) { for (uint64_t i2 = 0; i2 < ne2; i2++) { // Find longest contiguous slice if (ne1*nb1 == dstnb2) { slices.push_back({ buf_offset + i3*nb3 + i2*nb2, offset + i3*dstnb3 + i2*dstnb2, dstnb2 }); } else { for (uint64_t i1 = 0; i1 < ne1; i1++) { if (ne0*nb0/bs == dstnb1) { slices.push_back({ buf_offset + i3*nb3 + i2*nb2 + i1*nb1, offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, dstnb1 }); } else { const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1; const uint64_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1; for (uint64_t i0 = 0; i0 < ne0; i0++) { slices.push_back({ s_off + i1*nb0, d_off + i0*dstnb0, dstnb0 }); } } } } } } ggml_vk_sync_buffers(ctx, subctx); subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices); return; } if (!sync_staging) { GGML_ABORT("Asynchronous write to non-pinned memory not supported"); } // Staging buffer required vk_buffer& staging = ctx->device->sync_staging; const uint64_t copy_size = ts*ne/bs; ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size); VkBufferCopy buf_copy{ 0, offset, copy_size }; ggml_vk_sync_buffers(ctx, subctx); vkCmdCopyBuffer(subctx->s->buffer->buf, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy); for (uint64_t i3 = 0; i3 < ne3; i3++) { for (uint64_t i2 = 0; i2 < ne2; i2++) { // Find longest contiguous slice if (ne1*nb1 == dstnb2) { deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys); } else { for (uint64_t i1 = 0; i1 < ne1; i1++) { if (ne0*nb0/bs == dstnb1) { deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys); } else { const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1; const uint64_t d_off = i3*dstnb3 + i2*dstnb2 + i1*dstnb1; for (uint64_t i0 = 0; i0 < ne0; i0++) { deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys); } } } } } } } static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")"); // Check if src is pinned memory vk_buffer buf = nullptr; size_t buf_offset = 0; ggml_vk_host_get(dst->device, src, buf, buf_offset); if (buf != nullptr) { // Memory is pinned, use as staging buffer std::vector slices(1); if (width == spitch && width == dpitch) { // Only do single write if stride is equal slices[0].srcOffset = buf_offset; slices[0].dstOffset = offset; slices[0].size = width * height; } else { slices.resize(height); for (size_t i = 0; i < height; i++) { slices[i].srcOffset = buf_offset + i * spitch; slices[i].dstOffset = offset + i * dpitch; slices[i].size = width; } } ggml_vk_sync_buffers(nullptr, subctx); subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices); return true; } VK_LOG_DEBUG("STAGING"); if (!sync_staging) { // copy was not handled caller needs to fall back return false; } // Staging buffer required const size_t staging_size = width * height; ggml_vk_ensure_sync_staging_buffer(dst->device, staging_size); vk_buffer& staging_buffer = dst->device->sync_staging; std::vector slices(1); if (width == dpitch) { slices[0].srcOffset = 0; slices[0].dstOffset = offset; slices[0].size = staging_size; } else { slices.resize(height); for (size_t i = 0; i < height; i++) { slices[i].srcOffset = i * width; slices[i].dstOffset = offset + i * dpitch; slices[i].size = width; } } ggml_vk_sync_buffers(nullptr, subctx); subctx->s->buffer->buf.copyBuffer((VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, slices); if (width == spitch) { deferred_memcpy((uint8_t *)staging_buffer->ptr, src, staging_size, &subctx->in_memcpys); } else { for (size_t i = 0; i < height; i++) { deferred_memcpy((uint8_t *)staging_buffer->ptr + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys); } } return true; } static bool ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")"); return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, size, 1, sync_staging); } static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height) { VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")"); // Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); for (size_t i = 0; i < height; i++) { memcpy((uint8_t *)dst->ptr + offset + i * dpitch, (const uint8_t *) src + i * spitch, width); } } else { std::lock_guard guard(dst->device->mutex); vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(dst->device, subctx); bool ret = ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, dpitch, width, height, true); GGML_ASSERT(ret); ggml_vk_ctx_end(subctx); for (auto& cpy : subctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } for (auto& mset : subctx->memsets) { memset(mset.dst, mset.val, mset.n); } ggml_vk_submit(subctx, dst->device->fence); VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); dst->device->device.resetFences({ dst->device->fence }); ggml_vk_queue_command_pools_cleanup(dst->device); } } static void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")"); ggml_vk_buffer_write_2d(dst, offset, src, size, size, size, 1); } static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")"); GGML_ASSERT(width > 0); GGML_ASSERT(height > 0); GGML_ASSERT(src != nullptr); // TODO: staging_offset is not used // Check if dst is pinned memory vk_buffer buf = nullptr; size_t buf_offset = 0; ggml_vk_host_get(src->device, dst, buf, buf_offset); std::vector slices(1); if (width == spitch && width == dpitch) { // Only do single write if stride is equal slices[0].srcOffset = offset; slices[0].dstOffset = buf_offset; slices[0].size = width * height; } else { slices.resize(height); for (size_t i = 0; i < height; i++) { slices[i].srcOffset = offset + i * spitch; slices[i].dstOffset = buf_offset + i * dpitch; slices[i].size = width; } } if (buf != nullptr) { // Memory is pinned, use as staging buffer ggml_vk_sync_buffers(nullptr, subctx); subctx->s->buffer->buf.copyBuffer(src->buffer, buf->buffer, slices); return true; } VK_LOG_DEBUG("STAGING"); if (!sync_staging) { // copy was not handled caller needs to fall back return false; } // Fall back to staging buffer const size_t staging_size = width * height; ggml_vk_ensure_sync_staging_buffer(src->device, staging_size); vk_buffer& staging_buffer = src->device->sync_staging; std::vector staging_slices(1); if (width == spitch) { staging_slices[0].srcOffset = offset; staging_slices[0].dstOffset = 0; staging_slices[0].size = staging_size; } else { staging_slices.resize(height); for (size_t i = 0; i < height; i++) { staging_slices[i].srcOffset = offset + i * spitch; staging_slices[i].dstOffset = i * width; staging_slices[i].size = width; } } ggml_vk_sync_buffers(nullptr, subctx); subctx->s->buffer->buf.copyBuffer(src->buffer, staging_buffer->buffer, staging_slices); if (width == dpitch) { deferred_memcpy(dst, staging_buffer->ptr, staging_size, &subctx->out_memcpys); } else { for (size_t i = 0; i < height; i++) { deferred_memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) staging_buffer->ptr + i * width, width, &subctx->out_memcpys); } } return true; } static bool ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t size, bool sync_staging = false) { return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, sync_staging); } static void ggml_vk_buffer_read_2d(vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height) { VK_LOG_DEBUG("ggml_vk_buffer_read_2d(" << src->buffer << ", " << offset << ", " << width << ", " << height << ")"); // If the device is not an UMA device the memory is host-accessible through rebar. While writing // through PCIe is sufficient fast reading back data from PCIe is slower than going through // the HW device to host copy path. if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) { GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); for (size_t i = 0; i < height; i++) { memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) src->ptr + offset + i * spitch, width); } } else { std::lock_guard guard(src->device->mutex); vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(src->device, subctx); bool ret = ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, spitch, dpitch, width, height, true); GGML_ASSERT(ret); ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, src->device->fence); VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read_2d waitForFences"); src->device->device.resetFences({ src->device->fence }); ggml_vk_queue_command_pools_cleanup(src->device); for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } } } static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")"); ggml_vk_buffer_read_2d(src, offset, dst, size, size, size, 1); } static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")"); // Make sure both buffers are on same device GGML_ASSERT(src->device == dst->device); VkBufferCopy bc{ src_offset, dst_offset, size }; vkCmdCopyBuffer(ctx->s->buffer->buf, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc); } static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { if (src->device == dst->device) { std::lock_guard guard(src->device->mutex); VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")"); // Copy within the device vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(src->device, subctx); ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size); ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, src->device->fence); VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences"); src->device->device.resetFences({ src->device->fence }); ggml_vk_queue_command_pools_cleanup(src->device); } else { VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")"); // Copy device to device ggml_vk_ensure_sync_staging_buffer(src->device, size); // Copy to src staging buffer ggml_vk_buffer_copy(src->device->sync_staging, 0, src, src_offset, size); // Copy to dst buffer ggml_vk_buffer_write(dst, dst_offset, src->device->sync_staging->ptr, size); } } static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_memset_async(" << offset << ", " << c << ", " << size << ")"); if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && dst->device->uma) { deferred_memset((uint8_t*)dst->ptr + offset, c, size, &ctx->memsets); return; } // Fall back to GPU fillBuffer for non-UMA or non-host-visible buffers ctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c); } static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")"); if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && dst->device->uma) { memset((uint8_t*)dst->ptr + offset, c, size); return; } std::lock_guard guard(dst->device->mutex); vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(dst->device, subctx); subctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c); ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, dst->device->fence); VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences"); dst->device->device.resetFences({ dst->device->fence }); ggml_vk_queue_command_pools_cleanup(dst->device); } static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, uint32_t m, uint32_t n, uint32_t k, bool disable_split_k, const vk_pipeline& pipeline) { VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ", " << disable_split_k << ")"); if (disable_split_k) { return 1; } uint32_t split_k = 1; if (ctx->device->shader_core_count != 0 && m >= pipeline->wg_denoms[0] && n >= pipeline->wg_denoms[1]) { // If k is 'large' and the SMs will fill less than halfway, use split_k. uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]); uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]); if (k >= 2048) { if (m_tiles * n_tiles <= ctx->device->shader_core_count / 2) { split_k = ctx->device->shader_core_count / (m_tiles * n_tiles); } else if (m_tiles * n_tiles <= ctx->device->shader_core_count * 2 / 3) { split_k = 3; } // Cap the split at 8x. Unless k is huge this is a lot of overhead. split_k = std::min(split_k, 8u); // ggml_vk_matmul will align the splits to be a multiple of 256. // If this rounded up size would cause the last split to be empty, // then reduce the split count. while (true) { if (split_k == 1) { break; } uint32_t k_split = CEIL_DIV(k, split_k); k_split = ROUNDUP_POW2(k_split, 256); if (k_split * (split_k - 1) < k) { break; } split_k--; } } } return split_k; } static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type, ggml_type src1_type) { VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); if (ctx->device->coopmat2) { const uint32_t shader_core_count = ctx->device->shader_core_count; const uint32_t tiles_l = CEIL_DIV(m, mmp->a_l->wg_denoms[0]) * CEIL_DIV(n, mmp->a_l->wg_denoms[1]); const uint32_t tiles_m = CEIL_DIV(m, mmp->a_m->wg_denoms[0]) * CEIL_DIV(n, mmp->a_m->wg_denoms[1]); // Use large shader when the N dimension is greater than the medium shader's tile size uint32_t crossover_large = mmp->m->wg_denoms[1]; // Prefer large over medium if either: // - medium or large tiles would overfill the GPU // - large tiles with a split_k==3 fits in the GPU and medium tiles with split_k==2 does not // (medium with split_k==2 is probably better if it fits - more workgroups running and less split_k overhead) bool prefer_large = tiles_m > shader_core_count || tiles_l > shader_core_count || // split_k==3 with large tiles likely better than medium tiles with no split_k. (tiles_l <= shader_core_count / 3 && tiles_m > shader_core_count / 2); if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large && prefer_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) { return aligned ? mmp->a_l : mmp->l; } // Use medium shader when the N dimension is greater than the small shader's tile size uint32_t crossover_medium = mmp->s->wg_denoms[1]; if ((ctx->device->mul_mat_m[src0_type] && (n > crossover_medium)) || !ctx->device->mul_mat_s[src0_type]) { return aligned ? mmp->a_m : mmp->m; } return aligned ? mmp->a_s : mmp->s; } if ((ctx->device->mul_mat_s[src0_type] && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_l[src0_type])) { return aligned ? mmp->a_s : mmp->s; } if ((ctx->device->mul_mat_m[src0_type] && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_l[src0_type]) { return aligned ? mmp->a_m : mmp->m; } return aligned ? mmp->a_l : mmp->l; GGML_UNUSED(src1_type); } static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type, ggml_type src1_type) { VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true, src0_type, src1_type)->align; } static void ggml_vk_matmul( ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t padded_n) { VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", padded_n: " << padded_n << ")"); if (split_k == 1) { ggml_pipeline_request_descriptor_sets(ctx, pipeline, CEIL_DIV(batch, ctx->device->properties.limits.maxComputeWorkGroupCount[2])); uint32_t base_work_group_z = 0; while (base_work_group_z < batch) { uint32_t groups_z = std::min(batch - base_work_group_z, ctx->device->properties.limits.maxComputeWorkGroupCount[2]); const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k, ne02, ne12, broadcast2, broadcast3, padded_n }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, groups_z }); base_work_group_z += groups_z; } return; } if (ctx->prealloc_split_k_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } GGML_ASSERT(batch_stride_d == m * n); // Round the split size up to a multiple of 256 (k-quant alignment) uint32_t k_split = CEIL_DIV(k, split_k); k_split = ROUNDUP_POW2(k_split, 256); ggml_pipeline_request_descriptor_sets(ctx, pipeline, CEIL_DIV(batch, ctx->device->properties.limits.maxComputeWorkGroupCount[2])); uint32_t base_work_group_z = 0; while (base_work_group_z < batch) { uint32_t groups_z = std::min(batch - base_work_group_z, ctx->device->properties.limits.maxComputeWorkGroupCount[2]); const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k_split, ne02, ne12, broadcast2, broadcast3, padded_n }; // Make sure enough workgroups get assigned for split k to work ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, groups_z }); base_work_group_z += groups_z; } ggml_vk_sync_buffers(ctx, subctx); const std::array pc2 = { (uint32_t)(m * n * batch), split_k }; ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 }); ctx->prealloc_split_k_need_sync = true; } static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type) { VK_LOG_DEBUG("ggml_vk_guess_matmul_id_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")"); if (ctx->device->coopmat2) { // Use large shader when the N dimension is greater than the medium shader's tile size uint32_t crossover_large = mmp->m->wg_denoms[1]; if ((ctx->device->mul_mat_id_l[src0_type] && (n > crossover_large)) || (!ctx->device->mul_mat_id_m[src0_type] && !ctx->device->mul_mat_id_s[src0_type])) { return aligned ? mmp->a_l : mmp->l; } // Use medium shader when the N dimension is greater than the small shader's tile size uint32_t crossover_medium = mmp->s->wg_denoms[1]; if ((ctx->device->mul_mat_id_m[src0_type] && (n > crossover_medium)) || !ctx->device->mul_mat_id_s[src0_type]) { return aligned ? mmp->a_m : mmp->m; } return aligned ? mmp->a_s : mmp->s; } if ((ctx->device->mul_mat_id_s[src0_type] && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_id_m[src0_type] && !ctx->device->mul_mat_id_l[src0_type])) { return aligned ? mmp->a_s : mmp->s; } if ((ctx->device->mul_mat_id_m[src0_type] && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_id_l[src0_type]) { return aligned ? mmp->a_m : mmp->m; } return aligned ? mmp->a_l : mmp->l; } static uint32_t ggml_vk_guess_matmul_id_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type) { VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ", " << ggml_type_name(src0_type) << ")"); return ggml_vk_guess_matmul_id_pipeline(ctx, mmp, m, n, true, src0_type)->align; } static void ggml_vk_matmul_id( ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids, const vk_subbuffer & expert_count_buf, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11, uint32_t padded_n) { VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), expert_count: (" << expert_count_buf.buffer->buffer << ", " << expert_count_buf.offset << ", " << expert_count_buf.size << "), " << "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " << "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " << "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")"); const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, nei0, nei1, nbi1, ne11, padded_n }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids, expert_count_buf }, pc, { m, nei1, n_as }); } static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { return tensor->nb[0] == ggml_type_size(tensor->type) && tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && (tensor->ne[3] == 1 || tensor->nb[3] == tensor->nb[2]*tensor->ne[2]); } static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) { // Choose "contiguous copy" shader if src/dst are contiguous bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst)); // Use optimized "transpose" shader if src dim1 is the innermost dimension. bool transpose = dst && src->nb[1] == ggml_type_size(to) && ggml_are_same_shape(dst, src); if (transpose && src->type == to) { if (ggml_type_size(to) == 4) { return ctx->device->pipeline_cpy_transpose_32; } else if (ggml_type_size(to) == 2) { return ctx->device->pipeline_cpy_transpose_16; } } if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) { if (contig) { return ctx->device->pipeline_contig_cpy_f32_f32; } else { return ctx->device->pipeline_cpy_f32_f32; } } if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) { if (contig) { return ctx->device->pipeline_contig_cpy_f32_f16; } else { return ctx->device->pipeline_cpy_f32_f16; } } if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) { if (contig) { return ctx->device->pipeline_contig_cpy_f16_f16; } else { return ctx->device->pipeline_cpy_f16_f16; } } if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F32) { if (contig) { return ctx->device->pipeline_contig_cpy_f16_f32; } else { return ctx->device->pipeline_cpy_f16_f32; } } if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_BF16) { if (contig) { return ctx->device->pipeline_contig_cpy_f32_bf16; } else { return ctx->device->pipeline_cpy_f32_bf16; } } if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_I32) { if (contig) { return ctx->device->pipeline_contig_cpy_f32_i32; } else { return ctx->device->pipeline_cpy_f32_i32; } } if (src->type == GGML_TYPE_I32 && to == GGML_TYPE_F32) { if (contig) { return ctx->device->pipeline_contig_cpy_i32_f32; } else { return ctx->device->pipeline_cpy_i32_f32; } } if (src->type == GGML_TYPE_F32) { switch (to) { case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_IQ4_NL: return ctx->device->pipeline_cpy_f32_quant[to]; default: break; } } if (to == GGML_TYPE_F32) { switch (src->type) { case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_IQ4_NL: return ctx->device->pipeline_cpy_quant_f32[src->type]; default: break; } } if (src->type == to) { // Copy two or four bytes at a time, depending on block size. // For quantized types, we scale by block size/type size. But // this path is also used for bf16->bf16 for example, where the // type size must be exactly 2 or 4. GGML_ASSERT(ggml_is_quantized(to) || ggml_type_size(src->type) == 2 || ggml_type_size(src->type) == 4); if ((ggml_type_size(src->type) % 4) == 0) { if (contig) { return ctx->device->pipeline_contig_cpy_f32_f32; } else { return ctx->device->pipeline_cpy_f32_f32; } } else { if (contig) { return ctx->device->pipeline_contig_cpy_f16_f16; } else { return ctx->device->pipeline_cpy_f16_f16; } } } std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl; GGML_ABORT("fatal error"); } static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, const vk_subbuffer & in, const vk_subbuffer & out) { VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")"); const int tensor_type_size = ggml_type_size(tensor->type); const uint32_t ne = ggml_nelements(tensor); std::array elements; if (ne > 262144) { elements = { 512, 512, CEIL_DIV(ne, 262144) }; } else if (ne > 512) { elements = { 512, CEIL_DIV(ne, 512), 1 }; } else { elements = { ne, 1, 1 }; } vk_op_unary_push_constants pc = { (uint32_t)ne, (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size, (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], 1 , (uint32_t)tensor->ne[0] , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]), 0, 0.0f, 0.0f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; init_pushconst_fastdiv(pc); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); ggml_vk_sync_buffers(ctx, subctx); } static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) { switch(type) { case GGML_TYPE_Q8_1: return ctx->device->pipeline_quantize_q8_1_x4; default: std::cerr << "Missing quantize pipeline for type: " << ggml_type_name(type) << std::endl; GGML_ABORT("fatal error"); } } static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& subctx, const vk_subbuffer & in, const vk_subbuffer & out, uint32_t ne) { VK_LOG_DEBUG("ggml_vk_quantize_q8_1(" << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ", " << ne << ")"); vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); const uint32_t num_blocks = CEIL_DIV(ne, pipeline->wg_denoms[0]); // clamp the number of elements to the max workgroup count. The shader will iterate over the total number of blocks. const uint64_t max_elements = std::min(uint64_t{ctx->device->properties.limits.maxComputeWorkGroupCount[0]} * pipeline->wg_denoms[0], std::numeric_limits::max()); const uint32_t elements = std::min(ne, static_cast(max_elements)); const vk_quantize_q8_1_push_constants pc = { ne, num_blocks, }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, { elements, 1, 1 }); ggml_vk_sync_buffers(ctx, subctx); } static vk_pipeline ggml_vk_get_64b_indexing_pipeline(ggml_backend_vk_context * ctx, vk_pipeline &pipeline) { GGML_UNUSED(ctx); #if defined(VK_EXT_shader_64bit_indexing) vk_pipeline *ptr = &pipeline; while (*ptr) { if ((*ptr)->is_64b_indexing) { return *ptr; } ptr = &(*ptr)->next; } #endif return pipeline; } static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool disable_split_k) { VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << ggml_type_name(src0->type) << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << ggml_type_name(src1->type) << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << ggml_type_name(dst->type) << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; std::cerr << "))"); GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT const uint64_t ne00 = src0->ne[0]; const uint64_t ne01 = src0->ne[1]; const uint64_t ne02 = src0->ne[2]; const uint64_t ne03 = src0->ne[3]; const uint64_t ne10 = src1->ne[0]; const uint64_t ne11 = src1->ne[1]; const uint64_t ne12 = src1->ne[2]; const uint64_t ne13 = src1->ne[3]; const uint64_t ne21 = dst->ne[1]; const uint32_t stride_d = dst->nb[1] / ggml_type_size(dst->type); const uint32_t stride_batch_d = stride_d*ne21; const uint64_t r2 = ne12 / ne02; const uint64_t r3 = ne13 / ne03; ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; vk_buffer d_Qx = nullptr; size_t qx_buf_offset = 0; vk_buffer d_Qy = nullptr; size_t qy_buf_offset = 0; bool src0_uma = false; bool src1_uma = false; if (ctx->device->uma) { ggml_vk_host_get(ctx->device, src0->data, d_Qx, qx_buf_offset); ggml_vk_host_get(ctx->device, src1->data, d_Qy, qy_buf_offset); src0_uma = d_Qx != nullptr; src1_uma = d_Qy != nullptr; } // Reformat and convert to fp16 if non-contiguous, or for coopmat2 for better perf const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) || !ggml_vk_dim01_contiguous(src0); const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) || (src0->type == GGML_TYPE_BF16 && src1->type != GGML_TYPE_BF16) || !ggml_vk_dim01_contiguous(src1); // If src0 is BF16, try to use a BF16 x BF16 multiply ggml_type f16_type = src0->type == GGML_TYPE_BF16 ? GGML_TYPE_BF16 : GGML_TYPE_F16; const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0; // Check for mmq first vk_matmul_pipeline mmp = quantize_y ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, GGML_TYPE_Q8_1, (ggml_prec)dst->op_params[0]) : nullptr; if (mmp == nullptr) { // Fall back to f16 dequant mul mat mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? f16_type : src1->type, (ggml_prec)dst->op_params[0]); quantize_y = false; } const bool qx_needs_dequant = mmp == nullptr || x_non_contig; const bool qy_needs_dequant = !quantize_y && ((src1->type != f16_type && !y_f32_kernel) || y_non_contig); if (qx_needs_dequant) { // Fall back to dequant + f16 mulmat mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, f16_type, y_f32_kernel ? GGML_TYPE_F32 : f16_type, (ggml_prec)dst->op_params[0]); } // Not implemented GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT const uint32_t kpad = quantize_y ? 0 : ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, qx_needs_dequant ? f16_type : src0->type, quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type))); const bool aligned = !quantize_y && ne10 == kpad && ne01 > 8 && ne11 > 8; vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? f16_type : src0->type, quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type)); if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); } // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) : ne11; const uint64_t x_ne = ggml_nelements(src0); // 128 elements per Q8_1 x4 block const uint64_t y_ne = padded_n * ne10 * ne12 * ne13; const uint64_t d_ne = ggml_nelements(dst); const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, disable_split_k, pipeline); const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne; const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); const uint64_t d_sz = sizeof(float) * d_ne; vk_pipeline to_fp16_vk_0 = nullptr; vk_pipeline to_fp16_vk_1 = nullptr; vk_pipeline to_q8_1 = nullptr; if (x_non_contig) { to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, f16_type); } else { to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); } if (y_non_contig) { to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, f16_type); } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT if (quantize_y) { to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); } { const uint64_t split_k_size = split_k > 1 ? d_sz * split_k : 0; if ( (qx_needs_dequant && x_sz > ctx->device->properties.limits.maxStorageBufferRange) || (qy_needs_dequant && y_sz > ctx->device->properties.limits.maxStorageBufferRange) || (split_k > 1 && split_k_size > ctx->device->properties.limits.maxStorageBufferRange)) { GGML_ABORT("Requested preallocation size is too large"); } if (qx_needs_dequant && ctx->prealloc_size_x < x_sz) { ctx->prealloc_size_x = x_sz; ggml_vk_preallocate_buffers(ctx, subctx); } if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz) { ctx->prealloc_size_y = y_sz; ggml_vk_preallocate_buffers(ctx, subctx); } if (split_k > 1 && ctx->prealloc_size_split_k < split_k_size) { ctx->prealloc_size_split_k = split_k_size; ggml_vk_preallocate_buffers(ctx, subctx); } // Request descriptor sets if (qx_needs_dequant) { ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); } if (qy_needs_dequant) { ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); } if (quantize_y) { ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); } if (split_k > 1) { ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1); } } vk_buffer d_D = dst_buf_ctx->dev_buffer; const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D->size >= d_buf_offset + d_sz); vk_buffer d_X; uint64_t x_buf_offset = 0; vk_buffer d_Y; uint64_t y_buf_offset = 0; if (!src0_uma) { d_Qx = src0_buf_ctx->dev_buffer; qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if (!src1_uma) { d_Qy = src1_buf_ctx->dev_buffer; qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if (qx_needs_dequant) { d_X = ctx->prealloc_x; GGML_ASSERT(d_X->size >= x_sz); } else { d_X = d_Qx; x_buf_offset = qx_buf_offset; GGML_ASSERT(qx_sz == x_sz); } if (qy_needs_dequant) { d_Y = ctx->prealloc_y; GGML_ASSERT(d_Y->size >= y_sz); } else if (quantize_y) { d_Y = ctx->prealloc_y; GGML_ASSERT(d_Y->size >= CEIL_DIV(y_sz, 144) * 144); } else { d_Y = d_Qy; y_buf_offset = qy_buf_offset; GGML_ASSERT(qy_sz == y_sz); } if (x_non_contig || qx_needs_dequant) { if (ctx->prealloc_x_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } } if (x_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0)); } else if (qx_needs_dequant) { const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_X, 0, x_sz } }, pc, { (uint32_t)(x_ne), 1, 1}); ggml_vk_sync_buffers(ctx, subctx); } if (y_non_contig) { if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || ctx->prealloc_y_last_tensor_used != src1) { if (ctx->prealloc_y_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0)); ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); ctx->prealloc_y_last_tensor_used = src1; } } if (quantize_y) { if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() || ctx->prealloc_y_last_tensor_used != src1) { if (ctx->prealloc_y_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne); ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); ctx->prealloc_y_last_tensor_used = src1; } } uint32_t stride_batch_x = ne00*ne01; uint32_t stride_batch_y = ne10*ne11; if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) { stride_batch_x = src0->nb[0] / ggml_type_size(src0->type); } if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant && !quantize_y) { stride_batch_y = src1->nb[0] / ggml_type_size(src1->type); } // compute ggml_vk_matmul( ctx, subctx, pipeline, { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, ggml_vk_subbuffer(ctx, d_D, d_buf_offset), { ctx->prealloc_split_k, 0, d_sz * split_k }, ne01, ne11, ne10, ne10, ne10, stride_d, stride_batch_x, stride_batch_y, stride_batch_d, split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n ); // NOLINT if (x_non_contig || qx_needs_dequant) { ctx->prealloc_x_need_sync = true; } if (y_non_contig || quantize_y) { ctx->prealloc_y_need_sync = true; } } // Device tuning static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_t n, uint32_t k, ggml_type src0_type) { if (device->mmvq_mode == 1) { return true; } else if (device->mmvq_mode == -1) { return false; } // General performance issue with q3_k and q6_k due to 2-byte alignment if (src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q6_K) { return false; } // MMVQ is generally good for batches if (n > 1) { return true; } // Quantization overhead is not worth it for small k switch (device->vendor_id) { case VK_VENDOR_ID_NVIDIA: if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_IQ1_S || src0_type == GGML_TYPE_IQ1_M) { return true; } if (k <= 4096) { return false; } switch (src0_type) { case GGML_TYPE_MXFP4: case GGML_TYPE_Q8_0: return device->architecture == vk_device_architecture::NVIDIA_PRE_TURING; default: return true; } case VK_VENDOR_ID_AMD: if (k < 2048) { return false; } switch (src0_type) { case GGML_TYPE_Q8_0: return device->architecture == vk_device_architecture::AMD_GCN; default: return true; } case VK_VENDOR_ID_INTEL: if (device->driver_id == vk::DriverId::eIntelProprietaryWindows) { // Intel Windows proprietary driver MMVQ performance is worse than fp16, see // https://github.com/ggml-org/llama.cpp/issues/17628 return false; } if (k < 2048) { return false; } switch (src0_type) { // From tests on A770 Linux, may need more tuning case GGML_TYPE_Q4_0: case GGML_TYPE_Q5_1: return false; default: return true; } default: return true; } GGML_UNUSED(m); } static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { ggml_tensor * dst = cgraph->nodes[node_idx]; const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; std::cerr << ")),)"); GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT const uint64_t ne00 = src0->ne[0]; const uint64_t ne01 = src0->ne[1]; const uint64_t ne02 = src0->ne[2]; const uint64_t ne03 = src0->ne[3]; const uint64_t ne10 = src1->ne[0]; const uint64_t ne11 = src1->ne[1]; const uint64_t ne12 = src1->ne[2]; const uint64_t ne13 = src1->ne[3]; const uint64_t ne20 = dst->ne[0]; const uint64_t ne21 = dst->ne[1]; // const uint64_t ne22 = dst->ne[2]; // const uint64_t ne23 = dst->ne[3]; const uint64_t r2 = ne12 / ne02; const uint64_t r3 = ne13 / ne03; // batch_n indicates that we need to compute a few vector results, and this assumes // ne12 and ne13 are 1. It overloads the batch_strides to hold the row strides. GGML_ASSERT(ne11 == 1 || ne12 * ne13 == 1); bool batch_n = ne11 > 1; const bool x_non_contig = !ggml_vk_dim01_contiguous(src0); const bool y_non_contig = !ggml_vk_dim01_contiguous(src1); const bool f16_f32_kernel = src1->type == GGML_TYPE_F32; bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0 && ggml_vk_should_use_mmvq(ctx->device, ne01, ne11, ne10, src0->type); vk_pipeline to_fp16_vk_0 = nullptr; vk_pipeline to_fp16_vk_1 = nullptr; if (x_non_contig) { to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type); } if (y_non_contig) { to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type); } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } // Check for mmq first vk_pipeline dmmv = quantize_y ? ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, GGML_TYPE_Q8_1, ne11, ne20, ne00) : nullptr; vk_pipeline to_q8_1 = nullptr; if (dmmv == nullptr) { // Fall back to f16 dequant mul mat dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type, ne11, ne20, ne00); quantize_y = false; } if (quantize_y) { to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); } if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { dmmv = ggml_vk_get_64b_indexing_pipeline(ctx, dmmv); } const bool qx_needs_dequant = x_non_contig; const bool qy_needs_dequant = !quantize_y && ((src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig); // Not implemented GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT GGML_ASSERT(dmmv != nullptr); const uint64_t x_ne = ggml_nelements(src0); const uint64_t y_ne = ggml_nelements(src1); const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment); const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); { if ( (qx_needs_dequant && x_sz > ctx->device->properties.limits.maxStorageBufferRange) || (qy_needs_dequant && y_sz > ctx->device->properties.limits.maxStorageBufferRange)) { GGML_ABORT("Requested preallocation size is too large"); } if (qx_needs_dequant && ctx->prealloc_size_x < x_sz) { ctx->prealloc_size_x = x_sz; ggml_vk_preallocate_buffers(ctx, subctx); } if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz) { ctx->prealloc_size_y = y_sz; ggml_vk_preallocate_buffers(ctx, subctx); } // Request descriptor sets if (qx_needs_dequant) { ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); } if (qy_needs_dequant) { ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); } if (quantize_y) { ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); } } vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]); vk_subbuffer d_Qx = ggml_vk_tensor_subbuffer(ctx, src0); vk_subbuffer d_Qy = ggml_vk_tensor_subbuffer(ctx, src1); vk_subbuffer d_X, d_Y; if (qx_needs_dequant) { d_X = { ctx->prealloc_x, 0, ctx->prealloc_x->size }; } else { d_X = d_Qx; GGML_ASSERT(qx_sz == x_sz); } if (qy_needs_dequant || quantize_y) { d_Y = { ctx->prealloc_y, 0, ctx->prealloc_y->size }; } else { d_Y = d_Qy; } if (x_non_contig) { if (ctx->prealloc_x_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, d_Qx, d_X); } if (y_non_contig) { GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || ctx->prealloc_y_last_tensor_used != src1) { if (ctx->prealloc_y_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, d_Qy, d_Y); ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); ctx->prealloc_y_last_tensor_used = src1; } } if (quantize_y) { if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() || ctx->prealloc_y_last_tensor_used != src1) { if (ctx->prealloc_y_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } ggml_vk_quantize_q8_1(ctx, subctx, d_Qy, d_Y, y_ne); ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); ctx->prealloc_y_last_tensor_used = src1; } } // For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride uint32_t stride_batch_x = batch_n ? 0 : ne00*ne01; uint32_t stride_batch_y = batch_n ? ne10 : (ne10*ne11); uint32_t stride_batch_d = batch_n ? ne20 : (ne20*ne21); if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) { stride_batch_x = src0->nb[0] / ggml_type_size(src0->type); } if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) { stride_batch_y = src1->nb[0] / ggml_type_size(src1->type); } const uint32_t max_groups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0]; uint32_t groups_x = ne01; uint32_t groups_z = 1; if (ne01 > max_groups_x) { groups_z = 64; groups_x = CEIL_DIV(groups_x, groups_z); } uint32_t fusion_flags = 0; vk_subbuffer d_F0 = d_D; if (ctx->num_additional_fused_ops > 0) { const ggml_tensor * add = cgraph->nodes[node_idx + 1]; const ggml_tensor * bias = add->src[0] == dst ? add->src[1] : add->src[0]; d_F0 = ggml_vk_tensor_subbuffer(ctx, bias); fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS0; } vk_subbuffer d_F1 = d_D; if (ctx->num_additional_fused_ops == 2) { const ggml_tensor * add = cgraph->nodes[node_idx + 2]; const ggml_tensor * bias = add->src[0] == cgraph->nodes[node_idx + 1] ? add->src[1] : add->src[0]; d_F1 = ggml_vk_tensor_subbuffer(ctx, bias); fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS1; } ggml_pipeline_request_descriptor_sets(ctx, dmmv, CEIL_DIV(ne12 * ne13, ctx->device->properties.limits.maxComputeWorkGroupCount[1])); uint32_t base_work_group_y = 0; while (base_work_group_y < ne12 * ne13) { uint32_t groups_y = std::min((uint32_t)(ne12 * ne13) - base_work_group_y, ctx->device->properties.limits.maxComputeWorkGroupCount[1]); const vk_mat_vec_push_constants pc = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01, stride_batch_x, stride_batch_y, stride_batch_d, fusion_flags, base_work_group_y, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3, }; ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { d_X, d_Y, d_D, d_F0, d_F1, }, pc, { groups_x, groups_y, groups_z }); base_work_group_y += groups_y; } if (x_non_contig) { ctx->prealloc_x_need_sync = true; } if (y_non_contig || quantize_y) { ctx->prealloc_y_need_sync = true; } } static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { ggml_tensor * dst = cgraph->nodes[node_idx]; const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; std::cerr << "))"); GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); const uint64_t ne00 = src0->ne[0]; const uint64_t ne01 = src0->ne[1]; const uint64_t ne02 = src0->ne[2]; // const uint64_t ne03 = src0->ne[3]; //const uint64_t ne10 = src1->ne[0]; const uint64_t ne11 = src1->ne[1]; const uint64_t ne12 = src1->ne[2]; // const uint64_t ne13 = src1->ne[3]; GGML_ASSERT(ne11 == 1); // With grouped query attention there are > 1 Q matrices per K, V matrix. uint32_t gqa_ratio = (uint32_t)ne12 / (uint32_t)ne02; if (gqa_ratio > 8 || gqa_ratio == 0 || ne12 != ne02 * gqa_ratio) { gqa_ratio = 1; } vk_pipeline pipeline = ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1]; if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); } { // Request descriptor sets ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); } vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops], true); vk_subbuffer d_Qx = ggml_vk_tensor_subbuffer(ctx, src0); vk_subbuffer d_Qy = ggml_vk_tensor_subbuffer(ctx, src1, true); vk_subbuffer d_F0 = d_D; uint32_t fusion_flags = 0; if (ctx->num_additional_fused_ops > 0) { const ggml_tensor * add = cgraph->nodes[node_idx + 1]; const ggml_tensor * bias = add->src[0] == dst ? add->src[1] : add->src[0]; d_F0 = ggml_vk_tensor_subbuffer(ctx, bias); fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS0; } vk_subbuffer d_F1 = d_D; if (ctx->num_additional_fused_ops > 1) { const ggml_tensor * bias = cgraph->nodes[node_idx + 2]->src[1]; d_F1 = ggml_vk_tensor_subbuffer(ctx, bias); fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS1; } // compute vk_mat_vec_p021_push_constants pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, 0, 0, fusion_flags }; init_pushconst_tensor_offsets(ctx, pc, src0, src1, nullptr, nullptr, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]); uint32_t workgroups_z = (uint32_t)ne12; // When gqa_ratio > 1, each invocation does multiple rows and we can launch fewer workgroups if (gqa_ratio > 1) { workgroups_z /= gqa_ratio; } ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { d_Qx, d_Qy, d_D, d_F0, d_F1, }, pc, { 1, (uint32_t)ne01, workgroups_z }); } static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { ggml_tensor * dst = cgraph->nodes[node_idx]; const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; std::cerr << "))"); GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); const uint64_t ne00 = src0->ne[0]; const uint64_t ne01 = src0->ne[1]; const uint64_t ne02 = src0->ne[2]; const uint64_t ne03 = src0->ne[3]; const uint64_t nb01 = src0->nb[1]; const uint64_t nb02 = src0->nb[2]; const uint64_t nb12 = src1->nb[2]; // const uint64_t ne10 = src1->ne[0]; const uint64_t ne11 = src1->ne[1]; const uint64_t ne12 = src1->ne[2]; // const uint64_t ne13 = src1->ne[3]; const uint32_t nb03 = (uint32_t)(src0->nb[3] / sizeof(ggml_fp16_t)); const uint32_t nb13 = (uint32_t)(src1->nb[3] / sizeof(float)); const uint32_t nb23 = (uint32_t)(dst->nb[3] / sizeof(float)); GGML_ASSERT(ne11 == 1); GGML_ASSERT(src0->ne[3] == src1->ne[3]); // checked in supports_op const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t); const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t); const uint32_t channel_stride_y = nb12 / sizeof(float); vk_pipeline pipeline = ctx->device->pipeline_mul_mat_vec_nc_f16_f32; if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); } { // Request descriptor sets ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); } vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops], true); vk_subbuffer d_Qx = ggml_vk_tensor_subbuffer(ctx, src0); vk_subbuffer d_Qy = ggml_vk_tensor_subbuffer(ctx, src1, true); vk_subbuffer d_F0 = d_D; uint32_t fusion_flags = 0; if (ctx->num_additional_fused_ops > 0) { const ggml_tensor * add = cgraph->nodes[node_idx + 1]; const ggml_tensor * bias = add->src[0] == dst ? add->src[1] : add->src[0]; d_F0 = ggml_vk_tensor_subbuffer(ctx, bias); fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS0; } vk_subbuffer d_F1 = d_D; if (ctx->num_additional_fused_ops > 1) { const ggml_tensor * bias = cgraph->nodes[node_idx + 2]->src[1]; d_F1 = ggml_vk_tensor_subbuffer(ctx, bias); fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS1; } // compute vk_mat_vec_nc_push_constants pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, 0, 0, nb03, nb13, nb23, fusion_flags }; init_pushconst_tensor_offsets(ctx, pc, src0, src1, nullptr, nullptr, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { d_Qx, d_Qy, d_D, d_F0, d_F1, }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 }); } static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { ggml_tensor * dst = cgraph->nodes[node_idx]; ggml_tensor * src0 = dst->src[0]; ggml_tensor * src1 = dst->src[1]; VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")"); // Handle huge A matrix by splitting the M dimensions. This works well for convolution use cases // where the M dimension is very large. // Split_k doesn't work with M splitting. // This only supports batchsize == 1. const size_t nbytes = ggml_nbytes(src0); const bool needs_split = dst->ne[2] == 1 && dst->ne[3] == 1 && nbytes > ctx->device->properties.limits.maxStorageBufferRange; if (needs_split) { // Choose the number of rows that can fit (and divide by two, to allow for any additional offsets) const uint32_t M_split = ctx->device->properties.limits.maxStorageBufferRange / (2 * src0->nb[1]); uint32_t m_offset = 0; while (m_offset < dst->ne[0]) { const uint32_t cur_M_size = std::min(M_split, (uint32_t)(dst->ne[0] - m_offset)); ggml_tensor dst2 = *dst; ggml_tensor src02 = *src0; dst2.view_src = dst->view_src ? dst->view_src : dst; src02.view_src = src0->view_src ? src0->view_src : src0; dst2.view_offs += m_offset * dst->nb[0]; src02.view_offs += m_offset * src0->nb[1]; dst2.ne[0] = cur_M_size; src02.ne[1] = cur_M_size; ggml_vk_mul_mat_q_f16(ctx, subctx, &src02, src1, &dst2, true); m_offset += cur_M_size; } } else if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1 && // detect 0213 permutation, and batch size of 1 src0->nb[0] <= src0->nb[2] && src0->nb[2] <= src0->nb[1] && src0->nb[1] <= src0->nb[3] && src1->nb[0] <= src1->nb[2] && src1->nb[2] <= src1->nb[1] && src1->nb[1] <= src1->nb[3] && src0->ne[3] == 1 && src1->ne[3] == 1 && src0->ne[1] <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] && src1->ne[2] <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]) { ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, cgraph, node_idx); } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 && !ggml_is_permuted(src0) && !ggml_is_permuted(src1) && src0->ne[3] <= ctx->device->properties.limits.maxComputeWorkGroupCount[0] && src0->ne[1] <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] && src1->ne[2] <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]) { ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, cgraph, node_idx); // mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four) // when ne12 and ne13 are one. } else if ((dst->ne[1] == 1 || (dst->ne[1] <= mul_mat_vec_max_cols && src1->ne[2] * src1->ne[3] == 1)) && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || ggml_is_quantized(src0->type))) { ggml_vk_mul_mat_vec_q_f16(ctx, subctx, cgraph, node_idx); } else { ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, false); } } static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT GGML_ASSERT(ids->type == GGML_TYPE_I32); const uint64_t ne00 = src0->ne[0]; const uint64_t ne01 = src0->ne[1]; const uint64_t ne02 = src0->ne[2]; // const uint64_t ne03 = src0->ne[3]; const uint64_t ne10 = src1->ne[0]; const uint64_t ne11 = src1->ne[1]; const uint64_t ne12 = src1->ne[2]; const uint64_t ne13 = src1->ne[3]; const uint64_t nei0 = ids->ne[0]; const uint64_t nei1 = ids->ne[1]; const uint32_t nbi0 = ids->nb[0]; const uint32_t nbi1 = ids->nb[1]; const uint32_t nbi2 = ids->nb[2]; const uint64_t ne20 = dst->ne[0]; const uint64_t ne21 = dst->ne[1]; // const uint64_t ne22 = dst->ne[2]; // const uint64_t ne23 = dst->ne[3]; const uint64_t n_as = ne02; ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context; vk_buffer d_Qx = nullptr; size_t qx_buf_offset = 0; vk_buffer d_Qy = nullptr; size_t qy_buf_offset = 0; vk_buffer d_ids = nullptr; size_t ids_buf_offset = 0; bool src0_uma = false; bool src1_uma = false; bool ids_uma = false; if (ctx->device->uma) { ggml_vk_host_get(ctx->device, src0->data, d_Qx, qx_buf_offset); ggml_vk_host_get(ctx->device, src1->data, d_Qy, qy_buf_offset); ggml_vk_host_get(ctx->device, ids->data, d_ids, ids_buf_offset); src0_uma = d_Qx != nullptr; src1_uma = d_Qy != nullptr; ids_uma = d_ids != nullptr; } // Reformat and convert to fp16 if non-contiguous, or for coopmat2 for better perf const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) || !ggml_vk_dim01_contiguous(src0); const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) || (src0->type == GGML_TYPE_BF16 && src1->type != GGML_TYPE_BF16) || !ggml_vk_dim01_contiguous(src1); // If src0 is BF16, try to use a BF16 x BF16 multiply ggml_type f16_type = src0->type == GGML_TYPE_BF16 ? GGML_TYPE_BF16 : GGML_TYPE_F16; const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0; // Check for mmq first vk_matmul_pipeline mmp = quantize_y ? ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, GGML_TYPE_Q8_1, (ggml_prec)dst->op_params[0]) : nullptr; if (mmp == nullptr) { // Fall back to f16 dequant mul mat mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, y_non_contig ? f16_type : src1->type, (ggml_prec)dst->op_params[0]); quantize_y = false; } const bool qx_needs_dequant = mmp == nullptr || x_non_contig; const bool qy_needs_dequant = !quantize_y && ((src1->type != f16_type && !y_f32_kernel) || y_non_contig); if (qx_needs_dequant) { // Fall back to dequant + f16 mulmat mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, f16_type, y_f32_kernel ? GGML_TYPE_F32 : f16_type, (ggml_prec)dst->op_params[0]); } // Not implemented GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT const uint32_t kpad = quantize_y ? 0 : ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1, qx_needs_dequant ? f16_type : src0->type)); const bool aligned = !quantize_y && ne10 == kpad && ne01 > 8 && nei1 > 8; vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned, qx_needs_dequant ? f16_type : src0->type); if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); } // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) :ne11; const uint64_t x_ne = ggml_nelements(src0); const uint64_t y_ne = padded_n * ne10 * ne12 * ne13; const uint64_t d_ne = ggml_nelements(dst); const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne; const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); const uint64_t ids_sz = nbi2; const uint64_t d_sz = sizeof(float) * d_ne; vk_pipeline to_fp16_vk_0 = nullptr; vk_pipeline to_fp16_vk_1 = nullptr; vk_pipeline to_q8_1 = nullptr; if (x_non_contig) { to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, f16_type); } else { to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); } if (y_non_contig) { to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, f16_type); } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT if (quantize_y) { to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); } vk_pipeline count_experts = ctx->device->pipeline_count_experts; uint32_t expert_count_size = sizeof(uint32_t) * n_as; { if ( (qx_needs_dequant && x_sz > ctx->device->properties.limits.maxStorageBufferRange) || (qy_needs_dequant && y_sz > ctx->device->properties.limits.maxStorageBufferRange)) { GGML_ABORT("Requested preallocation size is too large"); } if (qx_needs_dequant && ctx->prealloc_size_x < x_sz) { ctx->prealloc_size_x = x_sz; ggml_vk_preallocate_buffers(ctx, subctx); } if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz) { ctx->prealloc_size_y = y_sz; ggml_vk_preallocate_buffers(ctx, subctx); } if (ctx->prealloc_size_split_k < expert_count_size) { ctx->prealloc_size_split_k = expert_count_size; ggml_vk_preallocate_buffers(ctx, subctx); } // Request descriptor sets ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); if (qx_needs_dequant) { ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); } if (qy_needs_dequant) { ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); } if (quantize_y) { ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); } ggml_pipeline_request_descriptor_sets(ctx, count_experts, 1); } vk_buffer d_D = dst_buf_ctx->dev_buffer; const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; GGML_ASSERT(d_D != nullptr); vk_buffer d_X; uint64_t x_buf_offset = 0; vk_buffer d_Y; uint64_t y_buf_offset = 0; if (!src0_uma) { d_Qx = src0_buf_ctx->dev_buffer; qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if (!src1_uma) { d_Qy = src1_buf_ctx->dev_buffer; qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if (!ids_uma) { d_ids = ids_buf_ctx->dev_buffer; ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs; GGML_ASSERT(d_ids != nullptr); } if (qx_needs_dequant) { d_X = ctx->prealloc_x; GGML_ASSERT(d_X->size >= x_sz); } else { d_X = d_Qx; x_buf_offset = qx_buf_offset; GGML_ASSERT(qx_sz == x_sz); } if (qy_needs_dequant) { d_Y = ctx->prealloc_y; GGML_ASSERT(d_Y->size >= y_sz); } else if (quantize_y) { d_Y = ctx->prealloc_y; GGML_ASSERT(d_Y->size >= CEIL_DIV(y_sz, 144) * 144); } else { d_Y = d_Qy; y_buf_offset = qy_buf_offset; GGML_ASSERT(qy_sz == y_sz); } if (x_non_contig || qx_needs_dequant) { if (ctx->prealloc_x_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } } // Count how many times each expert is used vk_subbuffer expert_count_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0); if (ctx->prealloc_split_k_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } { const std::vector pc = { (uint32_t)nei0, (uint32_t)nei1, (uint32_t)(nbi0 / ggml_type_size(ids->type)), (uint32_t)(nbi1 / ggml_type_size(ids->type)), (uint32_t)(get_misalign_bytes(ctx, ids) / ggml_type_size(ids->type)) }; ggml_vk_dispatch_pipeline(ctx, subctx, count_experts, { vk_subbuffer{ d_ids, ids_buf_offset, ids_sz }, expert_count_buf }, pc, { (uint32_t)n_as, 1, 1}); } if (x_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0)); } else if (qx_needs_dequant) { const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_X, 0, x_sz } }, pc, { (uint32_t)x_ne, 1, 1}); } if (y_non_contig) { if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || ctx->prealloc_y_last_tensor_used != src1) { if (ctx->prealloc_y_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0)); ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); ctx->prealloc_y_last_tensor_used = src1; } } if (quantize_y) { if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() || ctx->prealloc_y_last_tensor_used != src1) { if (ctx->prealloc_y_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne); ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); ctx->prealloc_y_last_tensor_used = src1; } } ggml_vk_sync_buffers(ctx, subctx); uint32_t stride_batch_x = ne00*ne01; uint32_t stride_batch_y = ne10*ne11; if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) { stride_batch_x = src0->nb[0] / ggml_type_size(src0->type); } if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant && !quantize_y) { stride_batch_y = src1->nb[0] / ggml_type_size(src1->type); } // compute ggml_vk_matmul_id( ctx, subctx, pipeline, { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz }, { d_ids, ids_buf_offset, ids_sz }, expert_count_buf, ne01, ne21, ne10, ne10, ne10, ne01, stride_batch_x, stride_batch_y, ne20*ne21, n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11, padded_n ); // NOLINT if (x_non_contig || qx_needs_dequant) { ctx->prealloc_x_need_sync = true; } if (y_non_contig || quantize_y) { ctx->prealloc_y_need_sync = true; } ctx->prealloc_split_k_need_sync = true; } static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { ggml_tensor * dst = cgraph->nodes[node_idx]; ggml_tensor * src0 = dst->src[0]; ggml_tensor * src1 = dst->src[1]; ggml_tensor * ids = dst->src[2]; VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; std::cerr << "))"); GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT GGML_ASSERT(ids->type == GGML_TYPE_I32); const uint64_t ne00 = src0->ne[0]; const uint64_t ne01 = src0->ne[1]; // const uint64_t ne02 = src0->ne[2]; // const uint64_t ne03 = src0->ne[3]; const uint64_t ne10 = src1->ne[0]; const uint64_t ne11 = src1->ne[1]; const uint64_t ne12 = src1->ne[2]; // const uint64_t ne13 = src1->ne[3]; const uint64_t nei0 = ids->ne[0]; const uint64_t nei1 = ids->ne[1]; const uint32_t nbi1 = (uint32_t)(ids->nb[1] / sizeof(int)); const uint64_t ne20 = dst->ne[0]; const uint64_t ne21 = dst->ne[1]; // const uint64_t ne22 = dst->ne[2]; // const uint64_t ne23 = dst->ne[3]; const bool x_non_contig = !ggml_vk_dim01_contiguous(src0); const bool y_non_contig = !ggml_vk_dim01_contiguous(src1); const bool f16_f32_kernel = src1->type == GGML_TYPE_F32; bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && !y_non_contig && (ne11 * ne10) % 4 == 0 && ggml_vk_should_use_mmvq(ctx->device, ne01, ne12, ne10, src0->type); vk_pipeline to_fp16_vk_0 = nullptr; vk_pipeline to_fp16_vk_1 = nullptr; if (x_non_contig) { to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type); } if (y_non_contig) { to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type); } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } // Check for mmq first vk_pipeline dmmv = quantize_y ? ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, GGML_TYPE_Q8_1, ne20, ne00) : nullptr; vk_pipeline to_q8_1 = nullptr; if (dmmv == nullptr) { // Fall back to f16 dequant mul mat dmmv = ggml_vk_get_dequantize_mul_mat_vec_id(ctx, src0->type, src1->type, ne20, ne00); quantize_y = false; } if (quantize_y) { to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); } const bool qx_needs_dequant = x_non_contig; const bool qy_needs_dequant = !quantize_y && ((src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig); if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { dmmv = ggml_vk_get_64b_indexing_pipeline(ctx, dmmv); } // Not implemented GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT GGML_ASSERT(dmmv != nullptr); const uint64_t x_ne = ggml_nelements(src0); const uint64_t y_ne = ggml_nelements(src1); const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment); const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); { if ( (qx_needs_dequant && x_sz > ctx->device->properties.limits.maxStorageBufferRange) || (qy_needs_dequant && y_sz > ctx->device->properties.limits.maxStorageBufferRange)) { GGML_ABORT("Requested preallocation size is too large"); } if (qx_needs_dequant && ctx->prealloc_size_x < x_sz) { ctx->prealloc_size_x = x_sz; ggml_vk_preallocate_buffers(ctx, subctx); } if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz) { ctx->prealloc_size_y = y_sz; ggml_vk_preallocate_buffers(ctx, subctx); } // Request descriptor sets if (qx_needs_dequant) { ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); } if (qy_needs_dequant) { ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); } if (quantize_y) { ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); } ggml_pipeline_request_descriptor_sets(ctx, dmmv, nei1); } vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]); vk_subbuffer d_Qx = ggml_vk_tensor_subbuffer(ctx, src0); vk_subbuffer d_Qy = ggml_vk_tensor_subbuffer(ctx, src1); vk_subbuffer d_ids = ggml_vk_tensor_subbuffer(ctx, ids); vk_subbuffer d_F0 = d_D; vk_subbuffer d_X, d_Y; if (qx_needs_dequant) { d_X = { ctx->prealloc_x, 0, ctx->prealloc_x->size }; } else { d_X = d_Qx; } if (qy_needs_dequant || quantize_y) { d_Y = { ctx->prealloc_y, 0, ctx->prealloc_y->size }; } else { d_Y = d_Qy; } if (x_non_contig) { if (ctx->prealloc_x_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } } if (x_non_contig) { GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, d_Qx, d_X); } if (y_non_contig) { GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || ctx->prealloc_y_last_tensor_used != src1) { if (ctx->prealloc_y_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, d_Qy, d_Y); ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); ctx->prealloc_y_last_tensor_used = src1; } } if (quantize_y) { if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() || ctx->prealloc_y_last_tensor_used != src1) { if (ctx->prealloc_y_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } ggml_vk_quantize_q8_1(ctx, subctx, d_Qy, d_Y, y_ne); ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); ctx->prealloc_y_last_tensor_used = src1; } } uint32_t stride_batch_y = ne10*ne11; if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) { stride_batch_y = src1->nb[2] / ggml_type_size(src1->type); } const uint32_t max_groups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0]; uint32_t groups_x = ne01; uint32_t groups_z = 1; if (ne01 > max_groups_x) { groups_z = 64; groups_x = CEIL_DIV(groups_x, groups_z); } uint32_t fusion_flags = 0; if (ctx->num_additional_fused_ops > 0) { const ggml_tensor * bias = cgraph->nodes[node_idx + 1]->src[1]; d_F0 = ggml_vk_tensor_subbuffer(ctx, bias); if (cgraph->nodes[node_idx + 1]->op == GGML_OP_MUL) { fusion_flags |= MAT_VEC_FUSION_FLAGS_SCALE0; } else { GGML_ASSERT(cgraph->nodes[node_idx + 1]->op == GGML_OP_ADD_ID); fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS0; } } vk_subbuffer d_F1 = d_D; if (ctx->num_additional_fused_ops > 1) { const ggml_tensor * scale = cgraph->nodes[node_idx + 2]->src[1]; d_F1 = ggml_vk_tensor_subbuffer(ctx, scale); fusion_flags |= MAT_VEC_FUSION_FLAGS_SCALE1; } // Loop over the batch dimension for (uint32_t expert_i1 = 0; expert_i1 < nei1; ++expert_i1) { const vk_mat_vec_id_push_constants pc = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01, (uint32_t)(ne00 * ne01), stride_batch_y, (uint32_t)(ne20 * ne21), fusion_flags, (uint32_t)nei0, (uint32_t)ne11, expert_i1, nbi1 }; ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { d_X, d_Y, d_D, d_F0, d_F1, d_ids, }, pc, { groups_x, (uint32_t)nei0, groups_z }); } if (x_non_contig) { ctx->prealloc_x_need_sync = true; } if (y_non_contig || quantize_y) { ctx->prealloc_y_need_sync = true; } } static bool ggml_vk_use_mul_mat_vec_id(const struct ggml_cgraph * cgraph, int node_idx) { ggml_tensor * dst = cgraph->nodes[node_idx]; ggml_tensor * src0 = dst->src[0]; ggml_tensor * src2 = dst->src[2]; return (src2->ne[1] <= 8) && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)); } static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { ggml_tensor * dst = cgraph->nodes[node_idx]; ggml_tensor * src0 = dst->src[0]; ggml_tensor * src1 = dst->src[1]; ggml_tensor * src2 = dst->src[2]; VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")"); if (ggml_vk_use_mul_mat_vec_id(cgraph, node_idx)) { ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, cgraph, node_idx); } else { ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst); } } static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type kv_type) { GGML_UNUSED(f32acc); // Needs to be kept up to date on shader changes const uint32_t wg_size = params.workgroup_size; const uint32_t Br = params.block_rows; const uint32_t Bc = params.block_cols; const uint32_t float_type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float); const bool mmq = device->integer_dot_product && device->subgroup_clustered && (kv_type == GGML_TYPE_Q4_0 || kv_type == GGML_TYPE_Q4_1 || kv_type == GGML_TYPE_Q5_0 || kv_type == GGML_TYPE_Q5_1 || kv_type == GGML_TYPE_Q8_0 || kv_type == GGML_TYPE_IQ4_NL); // tmpsh is overestimated slightly const uint32_t tmpsh = wg_size * sizeof(float); const uint32_t tmpshv4 = wg_size * 4 * float_type_size; const uint32_t masksh = Bc * (Br + 1) * float_type_size; uint32_t Qf, kvsh, kblocksh_size; if (mmq) { // block_b_cache: int32_t qs[8] + FLOAT_TYPEV2 ds const uint32_t block_b_size = 8 * sizeof(int32_t) + 2 * float_type_size; Qf = Br * (hsk / 32) * block_b_size; // kvsh uses D = HSV (K goes through kblocksh instead) kvsh = params.shmem_staging ? Bc * (hsv / 4 + 1) * 4 * float_type_size : 4 * float_type_size; // block_a_cache size depends on quant type uint32_t block_a_size; switch (kv_type) { case GGML_TYPE_Q4_0: block_a_size = 4 * sizeof(uint32_t) + float_type_size; break; case GGML_TYPE_Q4_1: block_a_size = 4 * sizeof(uint32_t) + 2 * float_type_size; break; case GGML_TYPE_Q5_0: block_a_size = 4 * sizeof(uint32_t) + sizeof(uint32_t) + float_type_size; break; case GGML_TYPE_Q5_1: block_a_size = 4 * sizeof(uint32_t) + sizeof(uint32_t) + 2 * float_type_size; break; case GGML_TYPE_Q8_0: case GGML_TYPE_IQ4_NL: block_a_size = 8 * sizeof(int32_t) + float_type_size; break; default: block_a_size = 0; break; } kblocksh_size = params.shmem_staging ? Bc * (hsk / 32) * block_a_size : block_a_size; } else { Qf = Br * (hsk / 4 + 1) * 4 * float_type_size; const uint32_t D = std::max(hsk, hsv); kvsh = params.shmem_staging ? Bc * (D / 4 + 1) * 4 * float_type_size : 4 * float_type_size; kblocksh_size = 0; } const uint32_t total_size = tmpsh + tmpshv4 + masksh + Qf + kvsh + kblocksh_size; const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; VK_LOG_DEBUG("ggml_vk_flash_attn_scalar_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", mmq=" << mmq << ", total_size=" << total_size << ", supported=" << supported); return supported; } static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc) { // Needs to be kept up to date on shader changes const uint32_t Br = params.block_rows; const uint32_t Bc = params.block_cols; const uint32_t MatBr = 16, MatBc = 16; const uint32_t row_split = Bc / MatBc; const uint32_t hsk_pad = ROUNDUP_POW2(hsk, 16); const uint32_t hsv_pad = ROUNDUP_POW2(hsv, 16); const uint32_t acctype = f32acc ? 4 : 2; const uint32_t f16vec4 = 8; const uint32_t tmpsh = (Bc / MatBc) * sizeof(float); const uint32_t qstride = hsk_pad / 4 + 2; const uint32_t Qf = Br * qstride * f16vec4; const uint32_t psh_stride = Br / 4 + 2; const uint32_t Psh = Bc * psh_stride * f16vec4; const uint32_t sfshstride = (hsk <= 128) ? (Br + 8) : Br; const uint32_t sfsh = Bc * sfshstride * acctype; const uint32_t kvshstride = (params.shmem_staging ? std::max(hsk_pad, hsv_pad) : MatBr) / 4 + 2; const uint32_t vsh_stride = MatBc / 4 * row_split; const uint32_t ksh = ((kvshstride >= vsh_stride) ? (Bc * kvshstride) : (Bc * vsh_stride)) * f16vec4; const uint32_t osh_stride = params.row_split * MatBr / 4; const uint32_t pvsh = MatBc * osh_stride * f16vec4; const uint32_t slope = Br * acctype; const uint32_t total_size = tmpsh + Qf + Psh + sfsh + ksh + pvsh + slope; const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", f32acc=" << f32acc << ", total_size=" << total_size << ", supported=" << supported); return supported; } static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3]; std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3]; std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; if (sinks) { std::cerr << "), (" << sinks << ", name=" << sinks->name << ", type=" << sinks->type << ", ne0=" << sinks->ne[0] << ", ne1=" << sinks->ne[1] << ", ne2=" << sinks->ne[2] << ", ne3=" << sinks->ne[3] << ", nb0=" << sinks->nb[0] << ", nb1=" << sinks->nb[1] << ", nb2=" << sinks->nb[2] << ", nb3=" << sinks->nb[3]; } std::cerr << "))"); GGML_TENSOR_LOCALS(int64_t, neq, q, ne) GGML_TENSOR_LOCALS(size_t, nbq, q, nb) GGML_TENSOR_LOCALS(int64_t, nek, k, ne) GGML_TENSOR_LOCALS(size_t, nbk, k, nb) GGML_TENSOR_LOCALS(int64_t, nev, v, ne) GGML_TENSOR_LOCALS(size_t, nbv, v, nb) GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) GGML_TENSOR_LOCALS(size_t, nb, dst, nb) const uint32_t nem0 = mask ? mask->ne[0] : 0; const uint32_t nem1 = mask ? mask->ne[1] : 0; const uint32_t nem2 = mask ? mask->ne[2] : 0; const uint32_t nem3 = mask ? mask->ne[3] : 0; const uint32_t HSK = nek0; const uint32_t HSV = nev0; uint32_t N = neq1; const uint32_t KV = nek1; GGML_ASSERT(ne0 == HSV); GGML_ASSERT(ne2 == N); // input tensor rows must be contiguous GGML_ASSERT(nbq0 == ggml_type_size(q->type)); GGML_ASSERT(nbk0 == ggml_type_size(k->type)); GGML_ASSERT(nbv0 == ggml_type_size(v->type)); GGML_ASSERT(neq0 == HSK); GGML_ASSERT(neq1 == N); GGML_ASSERT(nev1 == nek1); // dst cannot be transposed or permuted GGML_ASSERT(nb0 == sizeof(float)); GGML_ASSERT(nb0 <= nb1); GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); assert(dst->type == GGML_TYPE_F32); assert(q->type == GGML_TYPE_F32); uint32_t gqa_ratio = 1; uint32_t qk_ratio = neq2 / nek2; uint32_t workgroups_x = (uint32_t)neq1; uint32_t workgroups_y = (uint32_t)neq2; uint32_t workgroups_z = (uint32_t)neq3; const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32; // For scalar/coopmat1 FA, we can use the "large" size to accommodate qga. // For coopmat2 FA, we always use the small size (which is still pretty large for gqa). vk_fa_tuning_params tuning_params = get_fa_tuning_params(ctx->device, HSK, HSV, 512, KV, k->type, v->type, f32acc); const uint32_t max_gqa = std::min(tuning_params.block_rows, 32u); if (N <= 8 && qk_ratio > 1 && qk_ratio <= max_gqa && qk_ratio * nek2 == neq2 && nek2 == nev2 && nem2 <= 1) { // grouped query attention - make the N dimension equal to gqa_ratio, reduce // workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1 // and change addressing calculations to index Q's dimension 2. gqa_ratio = qk_ratio; N = gqa_ratio; workgroups_y /= gqa_ratio; } tuning_params = get_fa_tuning_params(ctx->device, HSK, HSV, N, KV, k->type, v->type, f32acc); if (tuning_params.path != FA_COOPMAT2) { GGML_ASSERT(k->type == v->type); } const uint32_t q_stride = (uint32_t)(nbq1 / ggml_type_size(q->type)); uint32_t k_stride = (uint32_t)(nbk1 / ggml_type_size(k->type)); uint32_t v_stride = (uint32_t)(nbv1 / ggml_type_size(v->type)); // For F32, the shader treats it as a block of size 4 (for vec4 loads) if (k->type == GGML_TYPE_F32) { k_stride /= 4; } if (v->type == GGML_TYPE_F32) { v_stride /= 4; } const uint32_t alignment = tuning_params.block_cols; bool aligned = (KV % alignment) == 0 && // the "aligned" shader variant will forcibly align strides, for performance (q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0; // Need to use the coopmat2 variant that clamps loads when HSK/HSV aren't sufficiently aligned. if (((HSK | HSV) % 16) != 0 && tuning_params.path == FA_COOPMAT2) { aligned = false; } float scale = 1.0f; float max_bias = 0.0f; float logit_softcap = 0.0f; memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float)); memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float)); memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float)); if (logit_softcap != 0) { scale /= logit_softcap; } // Only use mask opt when the mask is fairly large. This hasn't been tuned extensively. bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768 && nem0 >= tuning_params.block_cols * 16; vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(ctx->device, tuning_params, HSK, HSV, aligned, f32acc, mask != nullptr, use_mask_opt, logit_softcap != 0, k->type, v->type); vk_pipeline pipeline = nullptr; { std::lock_guard guard(ctx->device->mutex); auto &pipelines = ctx->device->pipeline_flash_attn_f32_f16[k->type]; auto it = pipelines.find(fa_pipeline_state); if (it != pipelines.end()) { pipeline = it->second; } else { pipelines[fa_pipeline_state] = pipeline = std::make_shared(); } } assert(pipeline); // Compile early to initialize wg_denoms. ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); uint32_t split_kv = KV; uint32_t split_k = 1; // Intel Alchemist prefers more workgroups const uint32_t shader_core_count_multiplier = (ctx->device->vendor_id == VK_VENDOR_ID_INTEL && ctx->device->architecture != INTEL_XE2) ? 2 : 1; // Use a placeholder core count if one isn't available. split_k is a big help for perf. const uint32_t shader_core_count = ctx->device->shader_core_count ? ctx->device->shader_core_count * shader_core_count_multiplier : 16; const uint32_t Br = fa_pipeline_state.Br; const uint32_t Bc = fa_pipeline_state.Bc; GGML_ASSERT(Br == pipeline->wg_denoms[0]); const uint32_t Tr = CEIL_DIV(N, Br); // Try to use split_k when KV is large enough to be worth the overhead. if (gqa_ratio > 1 && workgroups_x <= Br) { split_k = shader_core_count * 2 / (workgroups_x * workgroups_y * workgroups_z); } else if (gqa_ratio <= 1) { uint32_t total_wgs_no_split = Tr * workgroups_y * workgroups_z; if (total_wgs_no_split < shader_core_count * 2) { split_k = shader_core_count * 2 / total_wgs_no_split; } } if (split_k > 1) { // Try to evenly split KV into split_k chunks, but it needs to be a multiple // of "align", so recompute split_k based on that. split_kv = ROUNDUP_POW2(std::max(1u, KV / split_k), alignment); split_k = CEIL_DIV(KV, split_kv); } // Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1) // and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows. // For matrices, the order is (inner to outer) [HSV, ne1, k, ne2, ne3]. // For L/M, the order is (inner to outer) [ne1, k, ne2, ne3]. const uint64_t split_k_size = split_k > 1 ? (HSV * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne2 * ne3 : 0; if (split_k_size > ctx->device->properties.limits.maxStorageBufferRange) { GGML_ABORT("Requested preallocation size is too large"); } if (ctx->prealloc_size_split_k < split_k_size) { ctx->prealloc_size_split_k = split_k_size; ggml_vk_preallocate_buffers(ctx, subctx); } const uint32_t mask_opt_num_dwords = CEIL_DIV(nem0, 16 * Bc); const uint64_t mask_opt_size = sizeof(uint32_t) * mask_opt_num_dwords * CEIL_DIV(nem1, Br) * nem2 * nem3; vk_pipeline pipeline_fa_mask_opt = nullptr; if (use_mask_opt) { std::lock_guard guard(ctx->device->mutex); auto &pipelines = ctx->device->pipeline_fa_mask_opt; auto it = pipelines.find({Br, Bc}); if (it != pipelines.end()) { pipeline_fa_mask_opt = it->second; } else { pipelines[{Br, Bc}] = pipeline_fa_mask_opt = std::make_shared(); } assert(pipeline_fa_mask_opt); ggml_pipeline_request_descriptor_sets(ctx, pipeline_fa_mask_opt, 1); if (ctx->prealloc_size_y < mask_opt_size) { ctx->prealloc_size_y = mask_opt_size; ggml_vk_preallocate_buffers(ctx, subctx); } if (ctx->prealloc_y_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } } const uint32_t n_head_kv = neq2; const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); vk_subbuffer q_buf = ggml_vk_tensor_subbuffer(ctx, q); vk_subbuffer k_buf = ggml_vk_tensor_subbuffer(ctx, k); vk_subbuffer v_buf = ggml_vk_tensor_subbuffer(ctx, v); vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); vk_subbuffer mask_buf = mask ? ggml_vk_tensor_subbuffer(ctx, mask) : q_buf; vk_subbuffer sinks_buf = sinks ? ggml_vk_tensor_subbuffer(ctx, sinks) : q_buf; vk_subbuffer mask_opt_buf = use_mask_opt ? ggml_vk_subbuffer(ctx, ctx->prealloc_y, 0) : q_buf; uint32_t mask_n_head_log2 = ((sinks != nullptr) << 24) | n_head_log2; if (use_mask_opt) { const vk_op_flash_attn_mask_opt_push_constants opt_pc = { nem0, nem1, nem2, (uint32_t)(mask->nb[1] / sizeof(ggml_fp16_t)), (uint32_t)(mask->nb[2] / sizeof(ggml_fp16_t)), (uint32_t)(mask->nb[3] / sizeof(ggml_fp16_t)), mask_opt_num_dwords, mask_opt_num_dwords * CEIL_DIV(nem1, Br), mask_opt_num_dwords * CEIL_DIV(nem1, Br) * nem2, }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline_fa_mask_opt, { mask_buf, mask_opt_buf }, opt_pc, { mask_opt_num_dwords, CEIL_DIV(nem1, Br), nem2 * nem3 }); ggml_vk_sync_buffers(ctx, subctx); } const vk_flash_attn_push_constants pc = { N, KV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, (uint32_t)neq2, (uint32_t)neq3, (uint32_t)nek2, (uint32_t)nek3, (uint32_t)nev2, (uint32_t)nev3, nem1, nem2, nem3, q_stride, (uint32_t)nbq2, (uint32_t)nbq3, k_stride, (uint32_t)nbk2, (uint32_t)nbk3, v_stride, (uint32_t)nbv2, (uint32_t)nbv3, scale, max_bias, logit_softcap, mask_n_head_log2, m0, m1, gqa_ratio, split_kv, split_k }; if (split_k > 1) { ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1); if (ctx->prealloc_split_k_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } // We reuse workgroups_x to mean the number of splits, so we need to // cancel out the divide by wg_denoms[0]. uint32_t dispatch_x; if (gqa_ratio > 1) { workgroups_x *= pipeline->wg_denoms[0]; dispatch_x = split_k * workgroups_x; } else { dispatch_x = Tr * split_k * pipeline->wg_denoms[0]; } vk_subbuffer split_k_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {q_buf, k_buf, v_buf, mask_buf, sinks_buf, split_k_buf, mask_opt_buf}, pc, { dispatch_x, workgroups_y, workgroups_z }); ggml_vk_sync_buffers(ctx, subctx); const vk_op_flash_attn_split_k_reduce_push_constants pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, split_k, (sinks != nullptr) }; ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce, {split_k_buf, sinks_buf, dst_buf}, pc2, { (uint32_t)ne1, HSV, (uint32_t)(ne2 * ne3) }); ctx->prealloc_split_k_need_sync = true; } else { if (gqa_ratio > 1) { // When using gqa, we want one actual workgroup per batch, so cancel out wg_denoms workgroups_x *= pipeline->wg_denoms[0]; } ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {q_buf, k_buf, v_buf, mask_buf, sinks_buf, dst_buf, mask_opt_buf}, pc, { workgroups_x, workgroups_y, workgroups_z }); } } static vk_conv_shapes ggml_vk_conv_select_shape(ggml_backend_vk_context * ctx, uint32_t K, uint32_t NPQ) { auto n_tiles = [&](vk_conv_shapes s) { return CEIL_DIV(K, vk_conv_block_sizes[s].K) * CEIL_DIV(NPQ, vk_conv_block_sizes[s].NPQ); }; // We can't query number of shader cores on Intel, use 32 as a placeholder // so small convolutions will still choose a smaller tile. const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32; if (K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) { return CONV_SHAPE_128x128; } else if (K <= 32 && n_tiles(CONV_SHAPE_32x256) >= shader_core_count * 2) { return CONV_SHAPE_32x256; } else { return CONV_SHAPE_64x32; } } static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * dst, ggml_op op) { switch (op) { case GGML_OP_GET_ROWS: GGML_ASSERT(src1->type == GGML_TYPE_I32); if (src0->type == GGML_TYPE_I32) { // i32 src only supports i32 result GGML_ASSERT(dst->type == GGML_TYPE_I32); return ctx->device->pipeline_get_rows[src0->type]; } if (dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_get_rows[src0->type]; } if (dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_get_rows_f32[src0->type]; } return nullptr; case GGML_OP_ACC: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_acc_f32; } return nullptr; case GGML_OP_SET: if (src0->type == src1->type && src0->type == dst->type && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32)) { return ctx->device->pipeline_set_f32; } return nullptr; case GGML_OP_ADD: case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) || (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) || (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16)) { return nullptr; } switch (op) { case GGML_OP_ADD: { if (ctx->num_additional_fused_ops > 0) { if (ctx->do_add_rms_partials) { return ctx->device->pipeline_multi_add_rms[ctx->num_additional_fused_ops]; } else { return ctx->device->pipeline_multi_add[ctx->num_additional_fused_ops]; } } if (ctx->do_add_rms_partials) { auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_rms_norepeat : ctx->device->pipeline_add_rms; return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; } else { auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_norepeat : ctx->device->pipeline_add; return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; } } case GGML_OP_SUB: { auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_sub_norepeat : ctx->device->pipeline_sub; return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; } case GGML_OP_MUL: { auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_mul_norepeat : ctx->device->pipeline_mul; return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; } case GGML_OP_DIV: { auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_div_norepeat : ctx->device->pipeline_div; return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; } default: break; } return nullptr; case GGML_OP_ADD_ID: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && src2->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_add_id_f32; } return nullptr; case GGML_OP_CONCAT: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_concat_f32; } if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_concat_f16; } if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) { return ctx->device->pipeline_concat_i32; } return nullptr; case GGML_OP_UPSCALE: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { uint32_t mode = (ggml_get_op_params_i32(dst, 0) & (0xFF | GGML_SCALE_FLAG_ANTIALIAS)); switch (mode) { case GGML_SCALE_MODE_NEAREST: return ctx->device->pipeline_upscale_nearest_f32; case GGML_SCALE_MODE_BILINEAR: return ctx->device->pipeline_upscale_bilinear_f32; case GGML_SCALE_MODE_BICUBIC: return ctx->device->pipeline_upscale_bicubic_f32; case GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS: return ctx->device->pipeline_upscale_bilinear_antialias_f32; default: return nullptr; } } return nullptr; case GGML_OP_SCALE: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_scale_f32; } return nullptr; case GGML_OP_SQR: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_sqr_f32; } return nullptr; case GGML_OP_SQRT: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_sqrt_f32; } return nullptr; case GGML_OP_SIN: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_sin_f32; } return nullptr; case GGML_OP_COS: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_cos_f32; } return nullptr; case GGML_OP_LOG: if (src0->type == dst->type && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { return ctx->device->pipeline_log[dst->type == GGML_TYPE_F16]; } return nullptr; case GGML_OP_TRI: if (src0->type == dst->type && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { return ctx->device->pipeline_tri[dst->type == GGML_TYPE_F16]; } return nullptr; case GGML_OP_DIAG: if (src0->type == dst->type && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { return ctx->device->pipeline_diag[dst->type == GGML_TYPE_F16]; } return nullptr; case GGML_OP_CLAMP: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_clamp_f32; } return nullptr; case GGML_OP_PAD: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_pad_f32; } return nullptr; case GGML_OP_ROLL: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_roll_f32; } return nullptr; case GGML_OP_REPEAT: if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) { return ctx->device->pipeline_repeat_f32; } return nullptr; case GGML_OP_REPEAT_BACK: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_repeat_back_f32; } return nullptr; case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type); case GGML_OP_SET_ROWS: if (src1->type == GGML_TYPE_I64) { return ctx->device->pipeline_set_rows_i64[dst->type]; } else { return ctx->device->pipeline_set_rows_i32[dst->type]; } case GGML_OP_SILU_BACK: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_silu_back_f32; } return nullptr; case GGML_OP_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_norm_f32; } return nullptr; case GGML_OP_GROUP_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_group_norm_f32; } return nullptr; case GGML_OP_RMS_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { if (ctx->do_add_rms_partials) { return ctx->num_additional_fused_ops > 0 ? ctx->device->pipeline_rms_norm_mul_partials_f32 : ctx->device->pipeline_rms_norm_partials_f32; } else { return ctx->num_additional_fused_ops > 0 ? ctx->device->pipeline_rms_norm_mul_f32 : ctx->device->pipeline_rms_norm_f32; } } return nullptr; case GGML_OP_RMS_NORM_BACK: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_rms_norm_back_f32; } return nullptr; case GGML_OP_L2_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_l2_norm_f32; } return nullptr; case GGML_OP_UNARY: if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) || (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) || (src0->type != dst->type)) { return nullptr; } switch (ggml_get_unary_op(dst)) { case GGML_UNARY_OP_EXP: return ctx->device->pipeline_exp[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_ELU: return ctx->device->pipeline_elu[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_SILU: return ctx->device->pipeline_silu[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_GELU: return ctx->device->pipeline_gelu[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_GELU_ERF: return ctx->device->pipeline_gelu_erf[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_GELU_QUICK: return ctx->device->pipeline_gelu_quick[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_RELU: return ctx->device->pipeline_relu[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_XIELU: return ctx->device->pipeline_xielu[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_NEG: return ctx->device->pipeline_neg[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_TANH: return ctx->device->pipeline_tanh[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_SIGMOID: return ctx->device->pipeline_sigmoid[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_HARDSIGMOID: return ctx->device->pipeline_hardsigmoid[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_HARDSWISH: return ctx->device->pipeline_hardswish[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_ABS: return ctx->device->pipeline_abs[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_SOFTPLUS: return ctx->device->pipeline_softplus[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_STEP: return ctx->device->pipeline_step[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_ROUND: return ctx->device->pipeline_round[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_CEIL: return ctx->device->pipeline_ceil[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_FLOOR: return ctx->device->pipeline_floor[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_TRUNC: return ctx->device->pipeline_trunc[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_SGN: return ctx->device->pipeline_sgn[dst->type == GGML_TYPE_F16]; default: break; } return nullptr; case GGML_OP_GLU: if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) || (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) || (src0->type != dst->type)) { return nullptr; } switch (ggml_get_glu_op(dst)) { case GGML_GLU_OP_GEGLU: return ctx->device->pipeline_geglu[dst->type == GGML_TYPE_F16]; case GGML_GLU_OP_REGLU: return ctx->device->pipeline_reglu[dst->type == GGML_TYPE_F16]; case GGML_GLU_OP_SWIGLU: return ctx->device->pipeline_swiglu[dst->type == GGML_TYPE_F16]; case GGML_GLU_OP_SWIGLU_OAI: return ctx->device->pipeline_swiglu_oai[dst->type == GGML_TYPE_F16]; case GGML_GLU_OP_GEGLU_ERF: return ctx->device->pipeline_geglu_erf[dst->type == GGML_TYPE_F16]; case GGML_GLU_OP_GEGLU_QUICK: return ctx->device->pipeline_geglu_quick[dst->type == GGML_TYPE_F16]; default: break; } return nullptr; case GGML_OP_DIAG_MASK_INF: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_diag_mask_inf_f32; } return nullptr; case GGML_OP_SOFT_MAX: GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); if (ctx->num_additional_fused_ops) { uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0]))); GGML_ASSERT(idx < num_topk_moe_pipelines); // use n_experts from push constant if it's not equal to the power of two spec constant bool use_push = dst->ne[0] != (1u << idx); return ctx->device->pipeline_topk_moe[idx][use_push]; } if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) { return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_wg512 : ctx->device->pipeline_soft_max_f32; } if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_f16_wg512 : ctx->device->pipeline_soft_max_f32_f16; } return nullptr; case GGML_OP_SOFT_MAX_BACK: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_soft_max_back_f32; } return nullptr; case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { const ggml_tensor *rope = ctx->num_additional_fused_ops == 2 ? dst->src[0]->src[0] : dst; const int mode = ((const int32_t *) rope->op_params)[2]; const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; const bool is_vision = mode == GGML_ROPE_TYPE_VISION; if (is_neox) { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_rope_neox_f32; } if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_rope_neox_f32_f16; } if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_rope_neox_f16; } } else if (is_mrope && !is_vision) { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_rope_multi_f32; } if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_rope_multi_f32_f16; } if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_rope_multi_f16; } } else if (is_vision) { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_rope_vision_f32; } if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_rope_vision_f16; } } else { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_rope_norm_f32; } if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_rope_norm_f32_f16; } if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_rope_norm_f16; } } return nullptr; } case GGML_OP_SUM: case GGML_OP_SUM_ROWS: case GGML_OP_MEAN: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_sum_rows_f32; } return nullptr; case GGML_OP_CUMSUM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { if (src0->ne[0] <= 512) { return ctx->device->pipeline_cumsum_small_f32; } else { return ctx->device->pipeline_cumsum_f32; } } return nullptr; case GGML_OP_SOLVE_TRI: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { vk_solve_tri_pipeline_state solve_tri_pipeline_state(src0->ne[0], src1->ne[0]); vk_pipeline pipeline = nullptr; { std::lock_guard guard(ctx->device->mutex); auto it = ctx->device->pipeline_solve_tri_f32.find(solve_tri_pipeline_state); if (it != ctx->device->pipeline_solve_tri_f32.end()) { pipeline = it->second; } else { ctx->device->pipeline_solve_tri_f32[solve_tri_pipeline_state] = pipeline = std::make_shared(); } } return pipeline; } return nullptr; case GGML_OP_ARGMAX: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_I32) { return ctx->device->pipeline_argmax_f32; } return nullptr; case GGML_OP_COUNT_EQUAL: if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I64) { return ctx->device->pipeline_count_equal_i32; } return nullptr; case GGML_OP_IM2COL: if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_im2col_f32; } if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_im2col_f32_f16; } return nullptr; case GGML_OP_IM2COL_3D: if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_im2col_3d_f32; } if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_im2col_3d_f32_f16; } return nullptr; case GGML_OP_TIMESTEP_EMBEDDING: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_timestep_embedding_f32; } return nullptr; case GGML_OP_CONV_TRANSPOSE_1D: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_conv_transpose_1d_f32; } return nullptr; case GGML_OP_POOL_2D: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_pool2d_f32; } return nullptr; case GGML_OP_RWKV_WKV6: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_rwkv_wkv6_f32; } return nullptr; case GGML_OP_RWKV_WKV7: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_rwkv_wkv7_f32; } return nullptr; case GGML_OP_GATED_DELTA_NET: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { const uint32_t S_v = dst->src[2]->ne[0]; const uint32_t kda = (dst->src[3]->ne[0] == (int64_t)S_v) ? 1 : 0; uint32_t si; switch (S_v) { case 32: si = 0; break; case 64: si = 1; break; case 128: si = 2; break; default: return nullptr; } return ctx->device->pipeline_gated_delta_net[si][kda]; } return nullptr; case GGML_OP_SSM_SCAN: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { const uint32_t d_state = src0->ne[0]; if (d_state == 128) { return ctx->device->pipeline_ssm_scan_f32_d128; } else if (d_state == 256) { return ctx->device->pipeline_ssm_scan_f32_d256; } } return nullptr; case GGML_OP_SSM_CONV: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_ssm_conv_f32; } return nullptr; case GGML_OP_OPT_STEP_ADAMW: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_opt_step_adamw_f32; } return nullptr; case GGML_OP_OPT_STEP_SGD: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_opt_step_sgd_f32; } return nullptr; case GGML_OP_LEAKY_RELU: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_leaky_relu_f32; } return nullptr; case GGML_OP_CONV_2D: case GGML_OP_CONV_TRANSPOSE_2D: if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { uint32_t K = dst->ne[2]; // Cout uint32_t NPQ = dst->ne[3] * dst->ne[1] * dst->ne[0]; // N * OH * OW vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, K, NPQ); bool transpose = dst->op == GGML_OP_CONV_TRANSPOSE_2D; uint32_t KW = (uint32_t)src0->ne[0]; uint32_t KH = (uint32_t)src0->ne[1]; uint32_t s0 = (uint32_t)(ggml_get_op_params_i32(dst, 0)); uint32_t s1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 1) : s0; uint32_t p0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 2) : 0; uint32_t p1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 3) : 0; uint32_t d0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 4) : 1; uint32_t d1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 5) : 1; vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH); std::map *pipelines = nullptr; if (op == GGML_OP_CONV_2D) { if (src0->type == GGML_TYPE_F32) { pipelines = &ctx->device->pipeline_conv2d_f32[shape]; } else if (src0->type == GGML_TYPE_F16) { pipelines = &ctx->device->pipeline_conv2d_f16_f32[shape]; } } else if (op == GGML_OP_CONV_TRANSPOSE_2D) { if (src0->type == GGML_TYPE_F32) { pipelines = &ctx->device->pipeline_conv_transpose_2d_f32[shape]; } else if (src0->type == GGML_TYPE_F16) { pipelines = &ctx->device->pipeline_conv_transpose_2d_f16_f32[shape]; } } vk_pipeline pipeline = nullptr; { std::lock_guard guard(ctx->device->mutex); auto it = pipelines->find(conv2d_pipeline_state); if (it != pipelines->end()) { pipeline = it->second; } else { (*pipelines)[conv2d_pipeline_state] = pipeline = std::make_shared(); } } return pipeline; } return nullptr; case GGML_OP_CONV_2D_DW: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { if (ggml_is_contiguous(src1)) { return ctx->device->pipeline_conv2d_dw_whcn_f32; } else if (ggml_is_contiguous_channels(src1)) { return ctx->device->pipeline_conv2d_dw_cwhn_f32; } } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { if (ggml_is_contiguous(src1)) { return ctx->device->pipeline_conv2d_dw_whcn_f16_f32; } else if (ggml_is_contiguous_channels(src1)) { return ctx->device->pipeline_conv2d_dw_cwhn_f16_f32; } } return nullptr; case GGML_OP_ADD1: if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_add1_f16_f16; } if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_add1_f16_f32; } if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_add1_f32_f32; } return nullptr; case GGML_OP_ARANGE: if (dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_arange_f32; } return nullptr; case GGML_OP_FILL: if (dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_fill_f32; } if (dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_fill_f16; } return nullptr; default: return nullptr; } GGML_UNUSED(src2); } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); p.misalign_offsets = (a_offset << 16) | d_offset; GGML_UNUSED(src1); GGML_UNUSED(src2); GGML_UNUSED(src3); } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); p.misalign_offsets = (a_offset << 16) | d_offset; GGML_UNUSED(src1); GGML_UNUSED(src2); GGML_UNUSED(src3); } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_pad_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); p.misalign_offsets = (a_offset << 16) | d_offset; GGML_UNUSED(src1); GGML_UNUSED(src2); GGML_UNUSED(src3); } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_im2col_3d_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { const uint32_t a_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); p.misalign_offsets = (a_offset << 16) | d_offset; GGML_UNUSED(src0); GGML_UNUSED(src2); GGML_UNUSED(src3); } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0)); p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset; GGML_UNUSED(src2); GGML_UNUSED(src3); } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); p.a_offset = a_offset; p.d_offset = d_offset; GGML_UNUSED(src1); GGML_UNUSED(src2); GGML_UNUSED(src3); } template static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc) { VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; if (src1 != nullptr) { std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; } if (src2 != nullptr) { std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3]; } if (src3 != nullptr) { std::cerr << "), (" << src3 << ", name=" << src3->name << ", type=" << src3->type << ", ne0=" << src3->ne[0] << ", ne1=" << src3->ne[1] << ", ne2=" << src3->ne[2] << ", ne3=" << src3->ne[3] << ", nb0=" << src3->nb[0] << ", nb1=" << src3->nb[1] << ", nb2=" << src3->nb[2] << ", nb3=" << src3->nb[3]; } std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; std::cerr << "), " << ggml_op_name(op) << ")"); GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT GGML_ASSERT(dst->buffer != nullptr); const uint64_t ne00 = src0->ne[0]; const uint64_t ne01 = src0->ne[1]; const uint64_t ne02 = src0->ne[2]; const uint64_t ne03 = src0->ne[3]; const bool use_src1 = src1 != nullptr; const uint64_t ne10 = use_src1 ? src1->ne[0] : 0; const uint64_t ne11 = use_src1 ? src1->ne[1] : 0; const uint64_t ne12 = use_src1 ? src1->ne[2] : 0; const uint64_t ne13 = use_src1 ? src1->ne[3] : 0; const bool use_src2 = src2 != nullptr; const bool use_src3 = src3 != nullptr; init_pushconst_fastdiv(pc); vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op); if (pipeline == nullptr) { std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type); if (src1 != nullptr) { std::cerr << " and " << ggml_type_name(src1->type); } std::cerr << " to " << ggml_type_name(dst->type) << std::endl; GGML_ABORT("fatal error"); } ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); vk_subbuffer src0_buf = ggml_vk_tensor_subbuffer(ctx, src0, true); vk_subbuffer src1_buf = use_src1 ? ggml_vk_tensor_subbuffer(ctx, src1, true) : vk_subbuffer{}; vk_subbuffer src2_buf = use_src2 ? ggml_vk_tensor_subbuffer(ctx, src2, true) : vk_subbuffer{}; vk_subbuffer src3_buf = use_src3 ? ggml_vk_tensor_subbuffer(ctx, src3, true) : vk_subbuffer{}; vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, true); // Compute misalignment offset for descriptors and store it in in push constants. init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, src3, dst); std::array elements; switch (op) { case GGML_OP_NORM: case GGML_OP_RMS_NORM_BACK: case GGML_OP_L2_NORM: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: case GGML_OP_SUM_ROWS: case GGML_OP_CUMSUM: case GGML_OP_MEAN: case GGML_OP_ARGMAX: { const uint32_t nr = ggml_nrows(src0); if (nr > 262144) { elements = { 512, 512, CEIL_DIV(nr, 262144) }; } else if (nr > 512) { elements = { 512, CEIL_DIV(nr, 512), 1 }; } else { elements = { nr, 1, 1 }; } } break; case GGML_OP_SOLVE_TRI: { uint32_t nr = (uint32_t)(ne02 * ne03); if (nr > 262144) { elements = { 512, 512, CEIL_DIV(nr, 262144) }; } else if (nr > 512) { elements = { 512, CEIL_DIV(nr, 512), 1 }; } else { elements = { nr, 1, 1 }; } } break; case GGML_OP_RMS_NORM: if (ctx->do_add_rms_partials) { // Run one element per thread, 128 threads per workgroup elements = { (uint32_t)CEIL_DIV(ne00, 128), 1, 1 }; } else { elements = { (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne03 }; } break; case GGML_OP_SUM: // We use GGML_OP_SUM_ROWS with 1 row. elements = { 1, 1, 1 }; break; case GGML_OP_GROUP_NORM: { const uint32_t num_groups = dst->op_params[0]; elements = { num_groups * (uint32_t)src0->ne[3], 1, 1 }; } break; case GGML_OP_DIAG_MASK_INF: elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 }; break; case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { uint32_t nrows = (uint32_t)ggml_nrows(src0); uint32_t z = 1; if (nrows > ctx->device->properties.limits.maxComputeWorkGroupCount[0]) { z = CEIL_DIV(nrows, 32768); nrows = 32768; } elements = { nrows, (uint32_t)ne00, z }; } break; case GGML_OP_GET_ROWS: elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) }; elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]); elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]); break; case GGML_OP_ARGSORT: GGML_ASSERT(0); break; case GGML_OP_IM2COL: { const bool is_2D = dst->op_params[6] == 1; const uint32_t IC = src1->ne[is_2D ? 2 : 1]; const uint32_t KH = is_2D ? src0->ne[1] : 1; const uint32_t KW = src0->ne[0]; const uint32_t OH = is_2D ? dst->ne[2] : 1; const uint32_t OW = dst->ne[1]; const uint32_t batch = src1->ne[is_2D ? 3 : 2]; const uint32_t CHW = IC * KH * KW; // Cap X workgroups to limit concurrent IC channel reads. // The shader loops over X to cover the full CHW dimension. // AMD prefers a lower limit const uint32_t min_cap = ctx->device->vendor_id == VK_VENDOR_ID_AMD ? 512u : 4096u; const uint32_t x_elements = std::min(CHW, std::max(min_cap, OW * KH * KW)); elements = { x_elements, OW, OH * batch }; elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]); elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]); } break; case GGML_OP_IM2COL_3D: { const uint32_t IC = ((const uint32_t *)(dst->op_params))[9]; const uint32_t N = ne13 / IC; const uint32_t KD = ne02; const uint32_t KH = ne01; const uint32_t KW = ne00; const uint32_t OD = dst->ne[3] / N; const uint32_t OH = dst->ne[2]; const uint32_t OW = dst->ne[1]; const uint32_t IC_KD_KH_KW = IC*KD*KH*KW; const uint32_t N_OD_OH = N*OD*OH; elements = { IC_KD_KH_KW, OW, N_OD_OH }; elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]); } break; case GGML_OP_TIMESTEP_EMBEDDING: { const uint32_t dim = dst->op_params[0]; uint32_t half_ceil = (dim + 1) / 2; elements = { half_ceil, (uint32_t)src0->ne[0], 1 }; } break; case GGML_OP_CONV_TRANSPOSE_1D: { elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1} } break; case GGML_OP_POOL_2D: { const uint32_t N = dst->ne[3]; const uint32_t OC = dst->ne[2]; const uint32_t OH = dst->ne[1]; const uint32_t OW = dst->ne[0]; elements = { N * OC * OH * OW, 1, 1}; } break; case GGML_OP_CONV_2D: case GGML_OP_CONV_TRANSPOSE_2D: if constexpr (std::is_same_v) { const uint32_t NPQ = pc.N * pc.OH * pc.OW; const vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, pc.Cout, NPQ); const uint32_t NPQ_blocks = CEIL_DIV(NPQ, vk_conv_block_sizes[shape].NPQ); elements = { pc.Cout, NPQ_blocks, 1 }; if (elements[1] > 512) { elements[2] = CEIL_DIV(elements[1], 512); elements[1] = 512; } } else { GGML_ABORT("invalid push constant type for CONV_2D"); } break; case GGML_OP_ADD: case GGML_OP_SUB: case GGML_OP_DIV: case GGML_OP_MUL: case GGML_OP_ADD1: case GGML_OP_ARANGE: case GGML_OP_FILL: case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_SQRT: case GGML_OP_SIN: case GGML_OP_COS: case GGML_OP_LOG: case GGML_OP_TRI: case GGML_OP_DIAG: case GGML_OP_CLAMP: case GGML_OP_PAD: case GGML_OP_ROLL: case GGML_OP_REPEAT: case GGML_OP_REPEAT_BACK: case GGML_OP_CPY: case GGML_OP_CONCAT: case GGML_OP_UPSCALE: case GGML_OP_UNARY: case GGML_OP_GLU: case GGML_OP_CONV_2D_DW: { uint32_t ne = ggml_nelements(dst); if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { // Convert from number of logical elements to 2- or 4-byte units. ne /= ggml_blck_size(src0->type); if ((ggml_type_size(src0->type) % 4) == 0) { ne *= ggml_type_size(src0->type) / 4; } else { ne *= ggml_type_size(src0->type) / 2; } } // copy_to_quant has block size of 32, and each thread does QUANT_K elements. // Splitting into 512x512xZ wouldn't work well since each workgroup does 1024 elements. // So divide by block size here before splitting into 512x512 groups. if (op == GGML_OP_CPY && !ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { ne = CEIL_DIV(ne, ggml_blck_size(dst->type)); } if (ne > 262144) { elements = { 512, 512, CEIL_DIV(ne, 262144) }; } else if (ne > 512) { elements = { 512, CEIL_DIV(ne, 512), 1 }; } else { elements = { ne, 1, 1 }; } if (pipeline == ctx->device->pipeline_cpy_transpose_32 || pipeline == ctx->device->pipeline_cpy_transpose_16) { // 32x32 tiles elements[0] = (uint32_t)CEIL_DIV(dst->ne[0], 32); elements[1] = (uint32_t)CEIL_DIV(dst->ne[1], 32); elements[2] = (uint32_t)(dst->ne[2]*dst->ne[3]); elements[0] = std::min(elements[0], ctx->device->properties.limits.maxComputeWorkGroupCount[0]); elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]); elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]); } } break; case GGML_OP_ADD_ID: { elements = { (uint32_t)ne01, (uint32_t)ne02, 1 }; } break; case GGML_OP_SET_ROWS: { uint32_t ne = ggml_nelements(src0); if (ggml_is_quantized(dst->type)) { // quants run 32 threads each doing QUANT_K elements ne = CEIL_DIV(ne, 32 * ggml_blck_size(dst->type)); } else { // scalar types do one element per thread, running 512 threads ne = CEIL_DIV(ne, 512); } if (ne > 262144) { elements = { 512, 512, CEIL_DIV(ne, 262144) }; } else if (ne > 512) { elements = { 512, CEIL_DIV(ne, 512), 1 }; } else { elements = { ne, 1, 1 }; } } break; case GGML_OP_SSM_CONV: { const uint32_t nr = src0->ne[1]; const uint32_t n_t = dst->ne[1]; const uint32_t n_s = dst->ne[2]; elements = { nr, n_t, n_s }; } break; default: elements = { (uint32_t)ggml_nelements(src0), 1, 1 }; break; } if (op == GGML_OP_ADD || op == GGML_OP_RMS_NORM) { vk_subbuffer a_buf = src0_buf; if (ctx->do_add_rms_partials) { a_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_add_rms_partials, ctx->prealloc_size_add_rms_partials_offset); } ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, dst_buf, a_buf }, pc, elements); } else if (op == GGML_OP_GLU) { // Empty src1 is possible in glu, but the shader needs a buffer vk_subbuffer subbuf1 = use_src1 ? src1_buf : src0_buf; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc, elements); } else if (op == GGML_OP_SOFT_MAX) { // Empty src1 and src2 is possible in soft_max, but the shader needs a buffer vk_subbuffer subbuf1 = use_src1 ? src1_buf : src0_buf; vk_subbuffer subbuf2 = use_src2 ? src2_buf : src0_buf; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, subbuf2, dst_buf }, pc, elements); } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) { // Empty src2 and src3 is possible in rope, but the shader needs a buffer vk_subbuffer subbuf2 = use_src2 ? src2_buf : src0_buf; vk_subbuffer subbuf3 = use_src3 ? src3_buf : src0_buf; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, subbuf2, dst_buf, subbuf3 }, pc, elements); } else if (op == GGML_OP_IM2COL || op == GGML_OP_IM2COL_3D) { if (ctx->device->shader_int64 && ctx->device->buffer_device_address) { // buffer device address path doesn't use dst buffer dst_buf.size = 1; } // im2col uses only src1 and dst buffers ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src1_buf, dst_buf }, pc, elements); } else if (op == GGML_OP_COUNT_EQUAL) { // count_equal assumes that destination buffer is initialized with zeroes ggml_vk_buffer_memset_async(subctx, dst_buf.buffer, dst_buf.offset, 0, dst_buf.size); ggml_vk_sync_buffers(ctx, subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, dst_buf }, pc, elements); } else if (op == GGML_OP_OPT_STEP_SGD) { // OPT_STEP_SGD works on src0, it does not need dst ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf }, pc, elements); } else if (use_src3) { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf, src3_buf, dst_buf }, pc, elements); } else if (use_src2) { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf, dst_buf }, pc, elements); } else if (use_src1) { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, dst_buf }, pc, elements); } else { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, dst_buf }, pc, elements); } } static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, }); } static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); int nb1 = dst->op_params[0] / src0_type_size; // 4 bytes of float32 int nb2 = dst->op_params[1] / src0_type_size; // 4 bytes of float32 int nb3 = dst->op_params[2] / src0_type_size; // 4 bytes of float32 int offset = dst->op_params[3] / src0_type_size; // offset in bytes ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, dst->op, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)nb3, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)nb3, 0, 0.0f, 0.0f, offset, }); } static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) { const ggml_tensor *first_node = cgraph->nodes[node_idx]; const ggml_tensor *dst = cgraph->nodes[node_idx + ctx->num_additional_fused_ops]; // Make a list of all the tensors used by the op. // Last element of the list is the dest tensor. const ggml_tensor *tensors[MAX_PARAMETER_COUNT]; uint32_t num_srcs = ctx->num_additional_fused_ops + 2; uint32_t num_tensors = num_srcs + 1; GGML_ASSERT(num_tensors + ctx->do_add_rms_partials <= MAX_PARAMETER_COUNT); tensors[0] = first_node->src[0]; tensors[1] = first_node->src[1]; for (int32_t i = 0; i < ctx->num_additional_fused_ops; ++i) { // check whether the previous result is src[0] or src[1] if (cgraph->nodes[node_idx + i] == cgraph->nodes[node_idx + i + 1]->src[0]) { tensors[i+2] = cgraph->nodes[node_idx + i + 1]->src[1]; } else { tensors[i+2] = cgraph->nodes[node_idx + i + 1]->src[0]; } } tensors[num_srcs] = dst; vk_op_multi_add_push_constants pc; pc.ne20 = (uint32_t)dst->ne[0]; pc.ne21 = (uint32_t)dst->ne[1]; pc.ne22 = (uint32_t)dst->ne[2]; pc.ne23 = (uint32_t)dst->ne[3]; for (uint32_t i = 0; i < num_tensors; ++i) { const ggml_tensor *t = tensors[i]; pc.nb[i][0] = (uint32_t)t->nb[0] / sizeof(float); pc.nb[i][1] = (uint32_t)t->nb[1] / sizeof(float); pc.nb[i][2] = (uint32_t)t->nb[2] / sizeof(float); pc.nb[i][3] = (uint32_t)t->nb[3] / sizeof(float); } pc.rms_partials = ctx->do_add_rms_partials; vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, tensors[0], tensors[1], nullptr, dst, dst->op); if (pipeline == nullptr) { std::cerr << "ggml_vulkan: Error: Missing multi_add"; GGML_ABORT("fatal error"); } ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); ggml_backend_vk_buffer_context * buf_ctx[MAX_PARAMETER_COUNT]; vk_buffer buf[MAX_PARAMETER_COUNT]; size_t offset[MAX_PARAMETER_COUNT]; bool uma[MAX_PARAMETER_COUNT]; for (uint32_t i = 0; i < num_tensors; ++i) { buf_ctx[i] = (ggml_backend_vk_buffer_context *)tensors[i]->buffer->context; buf[i] = nullptr; offset[i] = 0; uma[i] = false; if (ctx->device->uma) { ggml_vk_host_get(ctx->device, tensors[i]->data, buf[i], offset[i]); uma[i] = buf[i] != nullptr; } if (!uma[i]) { buf[i] = buf_ctx[i]->dev_buffer; offset[i] = vk_tensor_offset(tensors[i]) + tensors[i]->view_offs; } GGML_ASSERT(buf[i] != nullptr); } // If any remaining descriptors are unused, just point them at src[0] for (uint32_t i = num_tensors; i < MAX_PARAMETER_COUNT; ++i) { buf[i] = buf[0]; offset[i] = 0; } if (ctx->do_add_rms_partials) { buf[num_tensors] = ctx->prealloc_add_rms_partials; offset[num_tensors] = ctx->prealloc_size_add_rms_partials_offset; } std::array elements; uint32_t ne = ggml_nelements(dst); if (ne > 262144) { elements = { 512, 512, CEIL_DIV(ne, 262144) }; } else if (ne > 512) { elements = { 512, CEIL_DIV(ne, 512), 1 }; } else { elements = { ne, 1, 1 }; } static_assert(MAX_PARAMETER_COUNT == 12); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { ggml_vk_subbuffer(ctx, buf[0], offset[0]), ggml_vk_subbuffer(ctx, buf[1], offset[1]), ggml_vk_subbuffer(ctx, buf[2], offset[2]), ggml_vk_subbuffer(ctx, buf[3], offset[3]), ggml_vk_subbuffer(ctx, buf[4], offset[4]), ggml_vk_subbuffer(ctx, buf[5], offset[5]), ggml_vk_subbuffer(ctx, buf[6], offset[6]), ggml_vk_subbuffer(ctx, buf[7], offset[7]), ggml_vk_subbuffer(ctx, buf[8], offset[8]), ggml_vk_subbuffer(ctx, buf[9], offset[9]), ggml_vk_subbuffer(ctx, buf[10], offset[10]), ggml_vk_subbuffer(ctx, buf[11], offset[11]), }, pc, elements); } static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_ADD, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, ctx->do_add_rms_partials, }); } static void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SUB, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, }); } static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_MUL, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, }); } static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_DIV, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, }); } static void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t src2_type_size = ggml_type_size(src2->type); ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_ADD_ID, { (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src2->nb[1] / src2_type_size, }); } static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, int version) { GGML_ASSERT(version == 6 || version == 7); int num_srcs = version == 6 ? 6 : 7; for (int i = 0; i < num_srcs; i++) { GGML_ASSERT(!ggml_is_quantized(dst->src[i]->type)); } GGML_ASSERT(dst->buffer != nullptr); vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, dst->src[0], dst->src[1], dst->src[2], dst, dst->op); GGML_ASSERT(pipeline != nullptr); ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); vk_subbuffer src_buf[7] = {}; for (int i = 0; i < num_srcs; i++) { src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]); } std::array elements = { (uint32_t)(pc.B * pc.H), 1, 1 }; if (version == 6) { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], dst_buf}, pc, elements); } else if (version == 7) { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], src_buf[6], dst_buf}, pc, elements); } else { // shouldn't happen GGML_ASSERT(false); } } static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { const size_t seq_length = dst->src[0]->ne[2]; const size_t n_embed = dst->ne[0]; const size_t n_heads = dst->src[0]->ne[1]; const size_t n_seqs = dst->src[5]->ne[1]; ggml_vk_op_f32_wkv( ctx, subctx, dst, { (uint32_t)n_seqs, (uint32_t)seq_length, (uint32_t)n_embed, (uint32_t)n_heads, }, 6 ); } static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { const size_t seq_length = dst->src[0]->ne[2]; const size_t n_embed = dst->ne[0]; const size_t n_heads = dst->src[0]->ne[1]; const size_t n_seqs = dst->src[6]->ne[1]; ggml_vk_op_f32_wkv( ctx, subctx, dst, { (uint32_t)n_seqs, (uint32_t)seq_length, (uint32_t)n_embed, (uint32_t)n_heads, }, 7 ); } static void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { const ggml_tensor * src_q = dst->src[0]; const ggml_tensor * src_v = dst->src[2]; const ggml_tensor * src_beta = dst->src[4]; GGML_ASSERT(dst->buffer != nullptr); const uint32_t S_v = (uint32_t)src_v->ne[0]; const uint32_t H = (uint32_t)src_v->ne[1]; const uint32_t n_tokens = (uint32_t)src_v->ne[2]; const uint32_t n_seqs = (uint32_t)src_v->ne[3]; const uint32_t s_off = S_v * H * n_tokens * n_seqs; vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, dst->src[0], dst->src[1], dst->src[2], dst, dst->op); GGML_ASSERT(pipeline != nullptr); ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); vk_subbuffer src_buf[6] = {}; for (int i = 0; i < 6; i++) { src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]); } const uint32_t sq1 = (uint32_t)(src_q->nb[1] / sizeof(float)); const uint32_t sq2 = (uint32_t)(src_q->nb[2] / sizeof(float)); const uint32_t sq3 = (uint32_t)(src_q->nb[3] / sizeof(float)); const uint32_t sv1 = (uint32_t)(src_v->nb[1] / sizeof(float)); const uint32_t sv2 = (uint32_t)(src_v->nb[2] / sizeof(float)); const uint32_t sv3 = (uint32_t)(src_v->nb[3] / sizeof(float)); const uint32_t sb1 = (uint32_t)(src_beta->nb[1] / sizeof(float)); const uint32_t sb2 = (uint32_t)(src_beta->nb[2] / sizeof(float)); const uint32_t sb3 = (uint32_t)(src_beta->nb[3] / sizeof(float)); const uint32_t neq1 = (uint32_t)src_q->ne[1]; const uint32_t rq3 = (uint32_t)(src_v->ne[3] / src_q->ne[3]); const float scale = 1.0f / sqrtf((float)S_v); const vk_op_gated_delta_net_push_constants pc = { H, n_tokens, n_seqs, s_off, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3, neq1, rq3, scale }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], dst_buf}, pc, { H, n_seqs, S_v }); } static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src2 = dst->src[2]; const ggml_tensor * src3 = dst->src[3]; const ggml_tensor * src4 = dst->src[4]; const ggml_tensor * src5 = dst->src[5]; GGML_ASSERT(dst->buffer != nullptr); const uint32_t head_dim = src0->ne[1]; const uint32_t n_head = src1->ne[1]; const uint32_t n_group = src4->ne[1]; const uint32_t n_tok = src1->ne[2]; const uint32_t n_seq = src1->ne[3]; bool is_mamba2 = (src3->nb[1] == sizeof(float)); GGML_ASSERT(is_mamba2); vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, dst->op); GGML_ASSERT(pipeline != nullptr); ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); const int64_t s_off = ggml_nelements(src1) * sizeof(float); const vk_op_ssm_scan_push_constants pc = { (uint32_t)src0->nb[2], (uint32_t)src0->nb[3], (uint32_t)src1->nb[2], (uint32_t)src1->nb[3], (uint32_t)src2->nb[1], (uint32_t)src2->nb[2], (uint32_t)src3->nb[1], (uint32_t)src4->nb[2], (uint32_t)src4->nb[3], (uint32_t)src5->nb[2], (uint32_t)src5->nb[3], (uint32_t)s_off, n_head, head_dim, n_group, n_tok }; vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); vk_subbuffer src_buf[7] = {}; for (int i = 0; i < 7 && dst->src[i] != nullptr; i++) { src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]); } std::array elements; const uint32_t d_state = src0->ne[0]; uint32_t num_subgroups = d_state / ctx->device->subgroup_size; const uint32_t num_workgroups_x = CEIL_DIV(n_head * head_dim, num_subgroups); const uint32_t num_workgroups_y = n_seq; elements = { num_workgroups_x, num_workgroups_y, 1 }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], src_buf[6], dst_buf}, pc, elements); } static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SSM_CONV, { (uint32_t)src0->nb[1], (uint32_t)src0->nb[2], (uint32_t)src1->nb[1], (uint32_t)dst->nb[0], (uint32_t)dst->nb[1], (uint32_t)dst->nb[2], (uint32_t)src1->ne[0], (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2], }); } static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_push_constants&& pc) { const ggml_tensor * x = dst->src[0]; const ggml_tensor * g = dst->src[1]; const ggml_tensor * gm = dst->src[2]; const ggml_tensor * gv = dst->src[3]; const ggml_tensor * p = dst->src[4]; GGML_ASSERT(x->type == GGML_TYPE_F32); GGML_ASSERT(g->type == GGML_TYPE_F32); GGML_ASSERT(gm->type == GGML_TYPE_F32); GGML_ASSERT(gv->type == GGML_TYPE_F32); GGML_ASSERT(p->type == GGML_TYPE_F32); GGML_ASSERT(dst->buffer != nullptr); GGML_ASSERT(ggml_is_contiguous(x)); GGML_ASSERT(ggml_is_contiguous(g)); GGML_ASSERT(ggml_is_contiguous(gm)); GGML_ASSERT(ggml_is_contiguous(gv)); GGML_ASSERT(ggml_is_contiguous(p)); GGML_ASSERT(ggml_are_same_shape(x, g)); GGML_ASSERT(ggml_are_same_shape(x, gm)); GGML_ASSERT(ggml_are_same_shape(x, gv)); GGML_ASSERT(ggml_nelements(p) == 7); vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, g, gm, gv, dst, GGML_OP_OPT_STEP_ADAMW); GGML_ASSERT(pipeline != nullptr); ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); vk_subbuffer x_buf = ggml_vk_tensor_subbuffer(ctx, x); vk_subbuffer g_buf = ggml_vk_tensor_subbuffer(ctx, g); vk_subbuffer gm_buf = ggml_vk_tensor_subbuffer(ctx, gm); vk_subbuffer gv_buf = ggml_vk_tensor_subbuffer(ctx, gv); vk_subbuffer p_buf = ggml_vk_tensor_subbuffer(ctx, p); std::array elements = { (uint32_t)ggml_nelements(x), 1, 1 }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {x_buf, g_buf, gm_buf, gv_buf, p_buf}, pc, elements); } static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { const size_t n = ggml_nelements(dst->src[0]); ggml_vk_op_f32_opt_step_adamw( ctx, subctx, dst, { (uint32_t)n, 0, 0.0f, 0.0f, 0.0f, 0.0f } ); } static void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { const size_t n = ggml_nelements(dst->src[0]); ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f, 0.0f, 0.0f }); } static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { int * op_params = (int *)dst->op_params; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONCAT, { (uint32_t)ggml_nelements(dst), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, op_params[0], }); } static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0); GGML_TENSOR_UNARY_OP_LOCALS float sf0 = (float)ne0 / ne00; float sf1 = (float)ne1 / ne01; float sf2 = (float)ne2 / ne02; float sf3 = (float)ne3 / ne03; float pixel_offset = 0.5f; if (mode & GGML_SCALE_FLAG_ALIGN_CORNERS) { sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0; sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1; pixel_offset = 0.0f; } ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UPSCALE, { (uint32_t)ggml_nelements(dst), 0, 0, (uint32_t)ne00, (uint32_t)ne01, (uint32_t)nb00 / src0_type_size, (uint32_t)nb01 / src0_type_size, (uint32_t)nb02 / src0_type_size, (uint32_t)nb03 / src0_type_size, (uint32_t)ne0, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, sf0, sf1, sf2, sf3, pixel_offset }); } static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); p.param1 = ggml_get_op_params_f32(dst, 0); p.param2 = ggml_get_op_params_f32(dst, 1); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p)); } static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst)); } static void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQRT, vk_op_unary_push_constants_init(src0, dst)); } static void ggml_vk_add1(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_ADD1, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, }); } static void ggml_vk_arange(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_arange(dst=" << dst << ", ne=" << ggml_nelements(dst) << ")"); vk_op_push_constants pc = { (uint32_t)ggml_nelements(dst), 1, ggml_get_op_params_f32(dst, 0), ggml_get_op_params_f32(dst, 2), 0.0f, 0.0f, }; vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, dst, GGML_OP_ARANGE); GGML_ASSERT(pipeline != nullptr); ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, false); std::array elements = { (uint32_t)ggml_nelements(dst), 1, 1 }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { dst_buf }, pc, elements); } static void ggml_vk_fill(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_fill(dst=" << dst << ", ne=" << ggml_nelements(dst) << ")"); vk_op_push_constants pc = { (uint32_t)ggml_nelements(dst), 1, ggml_get_op_params_f32(dst, 0), 0.0f, 0.0f, 0.0f, }; vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, dst, GGML_OP_FILL); GGML_ASSERT(pipeline != nullptr); ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, false); std::array elements = { (uint32_t)ggml_nelements(dst), 1, 1 }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { dst_buf }, pc, elements); } static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst)); } static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst)); } static void ggml_vk_log(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LOG, vk_op_unary_push_constants_init(src0, dst)); } static void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); p.param1 = ggml_get_op_params_f32(dst, 0); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p)); } static void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG, std::move(p)); } static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); p.param1 = ggml_get_op_params_f32(dst, 0); p.param2 = ggml_get_op_params_f32(dst, 1); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p)); } static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_pad_push_constants p = vk_op_pad_push_constants_init(src0, dst); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p)); } static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const int32_t s0 = ggml_get_op_params_i32(dst, 0); const int32_t s1 = ggml_get_op_params_i32(dst, 1); const int32_t s2 = ggml_get_op_params_i32(dst, 2); const int32_t s3 = ggml_get_op_params_i32(dst, 3); const uint32_t s01_packed = ((s0 + 0x8000) << 16) | (s1 + 0x8000); const uint32_t s23_packed = ((s2 + 0x8000) << 16) | (s3 + 0x8000); vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); memcpy(&p.param1, &s01_packed, sizeof(float)); memcpy(&p.param2, &s23_packed, sizeof(float)); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p)); } static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p)); } static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p)); } static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { uint32_t ne = (uint32_t)ggml_nelements(src0); if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { // Convert from number of logical elements to 2- or 4-byte units. ne /= ggml_blck_size(src0->type); if ((ggml_type_size(src0->type) % 4) == 0) { ne *= ggml_type_size(src0->type) / 4; } else { ne *= ggml_type_size(src0->type) / 2; } } vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p)); } static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); // Skip empty skip_rows operations. For most ops the empty check at the start // of ggml_vk_build_graph is sufficient, but set_rows can have a nonempty dst // with empty srcs. if (ggml_is_empty(src0) || ggml_is_empty(src1)) { return; } ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, }); } static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f }); } static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f, 0.0f, 0.0f }); } static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const int * int_op_params = (const int *)dst->op_params; const float * float_op_params = (const float *)dst->op_params; const uint32_t num_groups = int_op_params[0]; const float eps = float_op_params[1]; const uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f, 0.0f, 0.0f }); } static uint32_t ggml_vk_rms_num_partials(ggml_backend_vk_context * ctx, const ggml_tensor *node) { const uint32_t ne = (uint32_t)node->ne[0]; const uint32_t denom = ctx->device->pipeline_add_rms[0][0][0]->wg_denoms[0]; const uint32_t num_partials = CEIL_DIV(ne, denom); return num_partials; } static uint32_t ggml_vk_rms_partials_size(ggml_backend_vk_context * ctx, const ggml_tensor *node) { const uint32_t num_partials = ggml_vk_rms_num_partials(ctx, node); const uint32_t num_bytes = ROUNDUP_POW2(num_partials * sizeof(uint32_t), ctx->device->partials_binding_alignment); return num_bytes; } static vk_op_rope_push_constants ggml_vk_make_rope_constants(const ggml_tensor *dst, const ggml_tensor *src0, const bool has_ff, bool backprop, const uint32_t set_rows_stride) { const int n_dims = ((const int32_t *) dst->op_params)[1]; const int mode = ((const int32_t *) dst->op_params)[2]; // const int n_ctx = ((const int32_t *) dst->op_params)[3]; const int n_ctx_orig = ((const int32_t *) dst->op_params)[4]; const float freq_base = ((const float *) dst->op_params)[5]; const float freq_scale = ((const float *) dst->op_params)[6]; const float ext_factor = ((const float *) dst->op_params)[7]; const float attn_factor = ((const float *) dst->op_params)[8]; const float beta_fast = ((const float *) dst->op_params)[9]; const float beta_slow = ((const float *) dst->op_params)[10]; int sections[4] {}; if (mode & GGML_ROPE_TYPE_MROPE) { memcpy(sections, (const int32_t *) dst->op_params + 11, sizeof(int)*4); } const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); const float theta_scale = powf(freq_base, -2.0f/n_dims); uint32_t nb01 = src0->nb[1] / ggml_type_size(src0->type); uint32_t nb02 = src0->nb[2] / ggml_type_size(src0->type); uint32_t nb03 = src0->nb[3] / ggml_type_size(src0->type); uint32_t nb11 = dst->nb[1] / ggml_type_size(dst->type); uint32_t nb12 = dst->nb[2] / ggml_type_size(dst->type); uint32_t nb13 = dst->nb[3] / ggml_type_size(dst->type); vk_op_rope_push_constants rope { (uint32_t)mode, (uint32_t)ggml_nrows(src0), (uint32_t)n_dims, freq_scale, freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale, has_ff, { sections[0], sections[1], sections[2], sections[3] }, is_imrope, backprop, set_rows_stride, (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], nb01, nb02, nb03, nb11, nb12, nb13, }; return rope; } static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, float * op_params) { ggml_tensor * dst; const ggml_tensor * src0; const ggml_tensor * src1; if (ctx->num_additional_fused_ops > 0) { // fused rms_norm + mul ggml_tensor *mul = cgraph->nodes[node_idx + 1]; ggml_tensor *other_src = mul->src[0] == cgraph->nodes[node_idx + 0] ? mul->src[1] : mul->src[0]; dst = mul; src0 = cgraph->nodes[node_idx]->src[0]; src1 = other_src; } else { dst = cgraph->nodes[node_idx]; src0 = src1 = dst->src[0]; } const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); uint32_t param3 = ctx->do_add_rms_partials ? ggml_vk_rms_num_partials(ctx, dst) : 0; vk_op_binary_push_constants bin { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, op_params[0], 0.0f, (int32_t)param3, }; // more than one fused op means rms_norm+mul+rope if (ctx->num_additional_fused_ops > 1) { static constexpr uint32_t max_tensors = 7; const ggml_tensor *tensors[max_tensors] {}; ggml_tensor *rms = cgraph->nodes[node_idx + 0]; ggml_tensor *mul = cgraph->nodes[node_idx + 1]; ggml_tensor *rope = cgraph->nodes[node_idx + 2]; ggml_tensor *other_src = mul->src[0] == rms ? mul->src[1] : mul->src[0]; bool do_set_rows = ctx->num_additional_fused_ops == 4; tensors[0] = rms->src[0]; tensors[1] = other_src; tensors[2] = mul; tensors[3] = rope->src[1]; // pos tensors[4] = rope->src[2]; // ff tensors[5] = cgraph->nodes[node_idx + ctx->num_additional_fused_ops]; // dst tensors[6] = do_set_rows ? tensors[5]->src[1] : nullptr; const uint32_t set_rows_stride = do_set_rows ? tensors[5]->nb[1] / ggml_type_size(tensors[5]->type) : 0; vk_op_rms_norm_mul_rope_push_constants pc; pc.bin = bin; pc.rope = ggml_vk_make_rope_constants(rope, rope->src[0], tensors[4] != nullptr, false, set_rows_stride); vk_pipeline pipeline = tensors[5]->type == GGML_TYPE_F16 ? ctx->device->pipeline_rms_norm_mul_rope_f32_f16 : ctx->device->pipeline_rms_norm_mul_rope_f32_f32; ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); ggml_backend_vk_buffer_context * buf_ctx[max_tensors]; vk_buffer buf[max_tensors]; size_t offset[max_tensors]; bool uma[max_tensors]; for (uint32_t i = 0; i < max_tensors; ++i) { if (!tensors[i]) { // If any remaining descriptors are unused, just point them at src[0] buf[i] = buf[0]; offset[i] = 0; continue; } buf_ctx[i] = (ggml_backend_vk_buffer_context *)tensors[i]->buffer->context; buf[i] = nullptr; offset[i] = 0; uma[i] = false; if (ctx->device->uma) { ggml_vk_host_get(ctx->device, tensors[i]->data, buf[i], offset[i]); uma[i] = buf[i] != nullptr; } if (!uma[i]) { buf[i] = buf_ctx[i]->dev_buffer; offset[i] = vk_tensor_offset(tensors[i]) + tensors[i]->view_offs; } GGML_ASSERT(buf[i] != nullptr); } std::array elements; elements = { (uint32_t)rms->src[0]->ne[1], (uint32_t)rms->src[0]->ne[2], (uint32_t)rms->src[0]->ne[3] }; static_assert(max_tensors == 7); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { ggml_vk_subbuffer(ctx, buf[0], offset[0]), ggml_vk_subbuffer(ctx, buf[1], offset[1]), ggml_vk_subbuffer(ctx, buf[2], offset[2]), ggml_vk_subbuffer(ctx, buf[3], offset[3]), ggml_vk_subbuffer(ctx, buf[4], offset[4]), ggml_vk_subbuffer(ctx, buf[5], offset[5]), ggml_vk_subbuffer(ctx, buf[6], offset[6]), }, pc, elements); } else { ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM, std::move(bin)); } if (ctx->do_add_rms_partials_offset_calculation) { ctx->prealloc_size_add_rms_partials_offset += ggml_vk_rms_partials_size(ctx, src0); ctx->do_add_rms_partials = false; ctx->do_add_rms_partials_offset_calculation = false; } } static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f, 0.0f, 0.0f }); } static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const float * op_params = (const float *)dst->op_params; vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); p.param1 = op_params[0]; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_L2_NORM, std::move(p)); } static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f }); } static void ggml_vk_xielu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, op_params[1], op_params[2], op_params[3], op_params[4] } ); } static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const float * op_params_f = (const float *)dst->op_params; const bool swapped = (bool)dst->op_params[1]; const bool split = src1 != nullptr; const float alpha = op_params_f[2]; const float limit = op_params_f[3]; if (!split) { GGML_ASSERT(src0->ne[0] / 2 == dst->ne[0]); } else { GGML_ASSERT(src0->ne[0] == src1->ne[0]); GGML_ASSERT(src0->ne[0] == dst->ne[0]); GGML_ASSERT(src0->type == src1->type); } const uint32_t mode = split ? 2 : (swapped ? 1 : 0); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GLU, { (uint32_t)ggml_nelements(dst), (uint32_t)src0->ne[0], (uint32_t)dst->ne[0], mode, alpha, limit, (uint32_t)(src0->nb[1] / src0->nb[0]), (uint32_t)(src0->nb[2] / src0->nb[0]), (uint32_t)(src0->nb[3] / src0->nb[0]), (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)(dst->nb[1] / dst->nb[0]), (uint32_t)(dst->nb[2] / dst->nb[0]), (uint32_t)(dst->nb[3] / dst->nb[0]), (uint32_t)dst->ne[1], (uint32_t)dst->ne[2] }); } static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { int32_t * op_params = (int32_t *)dst->op_params; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }); } static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; float scale = op_params[0]; float max_bias = op_params[1]; const uint32_t ncols = (uint32_t)src0->ne[0]; const uint32_t nrows_x = (uint32_t)ggml_nrows(src0); const uint32_t nrows_y = (uint32_t)src0->ne[1]; const uint32_t ne12 = src1 ? (uint32_t)(src1->ne[2]) : 0u; const uint32_t ne13 = src1 ? (uint32_t)(src1->ne[3]) : 0u; const uint32_t nb11 = src1 ? (uint32_t)(src1->nb[1] / src1->nb[0]) : 0u; const uint32_t nb12 = src1 ? (uint32_t)(src1->nb[2] / src1->nb[0]) : 0u; const uint32_t nb13 = src1 ? (uint32_t)(src1->nb[3] / src1->nb[0]) : 0u; const uint32_t n_head_kv = src0->ne[2]; const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); vk_op_soft_max_push_constants pc { ncols, src1 != nullptr ? nrows_y : (uint32_t)0, (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], ne12, ne13, nb11, nb12, nb13, scale, max_bias, m0, m1, n_head_log2, nrows_x, src2 != nullptr }; if (ncols <= 16384) { ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, std::move(pc)); } else { vk_subbuffer buf_a = ggml_vk_tensor_subbuffer(ctx, src0); vk_subbuffer buf_b = src1 ? ggml_vk_tensor_subbuffer(ctx, src1) : buf_a; vk_subbuffer buf_c = src2 ? ggml_vk_tensor_subbuffer(ctx, src2) : buf_a; vk_subbuffer buf_d = ggml_vk_tensor_subbuffer(ctx, dst); uint32_t elems_per_wg = 128 * 4; uint32_t num_wgs = CEIL_DIV(ncols, elems_per_wg); size_t tmp_size = num_wgs * nrows_x * sizeof(float); if (ctx->prealloc_size_x < tmp_size) { ctx->prealloc_size_x = tmp_size; ggml_vk_preallocate_buffers(ctx, subctx); } if (ctx->prealloc_size_y < tmp_size) { ctx->prealloc_size_y = tmp_size; ggml_vk_preallocate_buffers(ctx, subctx); } if (ctx->prealloc_x_need_sync || ctx->prealloc_y_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } vk_subbuffer buf_x = { ctx->prealloc_x, 0, tmp_size }; vk_subbuffer buf_y = { ctx->prealloc_y, 0, tmp_size }; std::array elements = { num_wgs, nrows_x, 1 }; vk_pipeline pipeline1 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large1_f32_f16 : ctx->device->pipeline_soft_max_large1_f32; vk_pipeline pipeline2 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large2_f32_f16 : ctx->device->pipeline_soft_max_large2_f32; vk_pipeline pipeline3 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large3_f32_f16 : ctx->device->pipeline_soft_max_large3_f32; ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1); ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1); ggml_pipeline_request_descriptor_sets(ctx, pipeline3, 1); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); ggml_vk_sync_buffers(ctx, subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); ggml_vk_sync_buffers(ctx, subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline3, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); ctx->prealloc_x_need_sync = true; ctx->prealloc_y_need_sync = true; } } static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1], 0.0f, 0.0f }); } static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) { topk_moe_mode mode = ctx->fused_topk_moe_mode; ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0]; ggml_tensor * bias = (mode == TOPK_MOE_SIGMOID_NORM_BIAS) ? cgraph->nodes[node_idx + 2]->src[1] : logits; ggml_tensor * weights = cgraph->nodes[node_idx + ctx->num_additional_fused_ops]; ggml_tensor * ids = (mode == TOPK_MOE_SIGMOID_NORM_BIAS) ? cgraph->nodes[node_idx + 4] : (mode == TOPK_MOE_LATE_SOFTMAX) ? cgraph->nodes[node_idx + 1] : cgraph->nodes[node_idx + 3]; GGML_ASSERT(logits->type == GGML_TYPE_F32); GGML_ASSERT(bias->type == GGML_TYPE_F32); GGML_ASSERT(weights->type == GGML_TYPE_F32); GGML_ASSERT(ids->type == GGML_TYPE_I32); const int n_experts = logits->ne[0]; const int n_rows = logits->ne[1]; const int n_expert_used = weights->ne[1]; GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts); vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, cgraph->nodes[node_idx], GGML_OP_SOFT_MAX); ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); vk_subbuffer logits_buf = ggml_vk_tensor_subbuffer(ctx, logits); vk_subbuffer bias_buf = ggml_vk_tensor_subbuffer(ctx, bias); vk_subbuffer weights_buf = ggml_vk_tensor_subbuffer(ctx, weights); vk_subbuffer ids_buf = ggml_vk_tensor_subbuffer(ctx, ids); vk_op_topk_moe_push_constants pc {}; pc.n_rows = n_rows; pc.n_experts_push = n_experts; pc.n_expert_used = n_expert_used; pc.clamp_min = -std::numeric_limits::infinity(); pc.clamp_max = std::numeric_limits::infinity(); if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) { ggml_tensor * clamp = cgraph->nodes[node_idx + 7]; GGML_ASSERT(clamp->op == GGML_OP_CLAMP); pc.clamp_min = ggml_get_op_params_f32(clamp, 0); pc.clamp_max = ggml_get_op_params_f32(clamp, 1); } if (mode == TOPK_MOE_SIGMOID_NORM_BIAS) { ggml_tensor * clamp = cgraph->nodes[node_idx + 8]; GGML_ASSERT(clamp->op == GGML_OP_CLAMP); pc.clamp_min = ggml_get_op_params_f32(clamp, 0); pc.clamp_max = ggml_get_op_params_f32(clamp, 1); } #define GATING_FUNC_SOFTMAX 0 #define GATING_FUNC_SIGMOID 1 #define GATING_FUNC_SOFTMAX_WEIGHT 2 pc.gating_func = mode == TOPK_MOE_SIGMOID_NORM_BIAS ? GATING_FUNC_SIGMOID : mode == TOPK_MOE_LATE_SOFTMAX ? GATING_FUNC_SOFTMAX_WEIGHT : GATING_FUNC_SOFTMAX; pc.has_bias = mode == TOPK_MOE_SIGMOID_NORM_BIAS; pc.with_norm = mode == TOPK_MOE_EARLY_SOFTMAX_NORM || mode == TOPK_MOE_SIGMOID_NORM_BIAS; if (ctx->fused_topk_moe_scale) { GGML_ASSERT(weights->op == GGML_OP_SCALE); pc.output_scale = ggml_get_op_params_f32(weights, 0); pc.output_bias = ggml_get_op_params_f32(weights, 1); } else { pc.output_scale = 1.0f; pc.output_bias = 0.0f; } GGML_ASSERT(n_expert_used <= n_experts); const uint32_t rows_per_block = 4; std::array elements = { CEIL_DIV(n_rows, rows_per_block), 1, 1 }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {logits_buf, bias_buf, weights_buf, ids_buf}, pc, elements); } static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop) { ggml_tensor * dst = cgraph->nodes[node_idx]; const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src2 = dst->src[2]; const ggml_tensor * src3 = nullptr; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; // const int n_ctx = ((int32_t *) dst->op_params)[3]; const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; const float freq_base = ((float *) dst->op_params)[5]; const float beta_fast = ((float *) dst->op_params)[9]; const float beta_slow = ((float *) dst->op_params)[10]; int sections[4] {}; if (mode & GGML_ROPE_TYPE_MROPE) { memcpy(sections, (int32_t *) dst->op_params + 11, sizeof(int)*4); } float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); uint32_t set_rows_stride = 0; // Fused rope + view + set_rows passes the set_rows destination stride in set_rows_stride // and overrides the dst and sets src3=row_indices if (ctx->num_additional_fused_ops > 0) { set_rows_stride = cgraph->nodes[node_idx + 2]->nb[1] / ggml_type_size(cgraph->nodes[node_idx + 2]->type); src3 = cgraph->nodes[node_idx + 2]->src[1]; dst = cgraph->nodes[node_idx + 2]; } ggml_vk_op_f32(ctx, subctx, src0, src1, src2, src3, dst, GGML_OP_ROPE, ggml_vk_make_rope_constants(cgraph->nodes[node_idx], src0, src2 != nullptr, backprop, set_rows_stride)); } static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const uint32_t * op_params = (const uint32_t *)dst->op_params; uint32_t ncols = src0->ne[0]; uint32_t nrows = ggml_nrows(src0); uint32_t ncols_pad_log2 = (uint32_t)ceilf(log2f(float(ncols))); uint32_t ncolsp2 = 1 << ncols_pad_log2; vk_op_argsort_push_constants pc { ncols, ncolsp2, ncols_pad_log2, nrows, op_params[0], 0, 0, 0, 0, }; // Pick the largest workgroup size <= ncolsp2 uint32_t pipeline_idx = std::min(ncols_pad_log2, num_argsort_pipelines - 1); // Use the "small" argsort shader if the whole sort can be done by a single workgroup. bool use_small = ncols_pad_log2 <= ctx->device->max_workgroup_size_log2 && ctx->device->pipeline_argsort_f32[pipeline_idx] != nullptr; vk_pipeline pipeline = use_small ? ctx->device->pipeline_argsort_f32[pipeline_idx] : ctx->device->pipeline_argsort_large_f32[pipeline_idx]; vk_subbuffer src0_buf = ggml_vk_tensor_subbuffer(ctx, src0); vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); vk_subbuffer subbuf1 = dst_buf; // Reserve space for ivec2 per element, with rows padded to a power of two if (!use_small) { const size_t x_sz = size_t{ncolsp2} * nrows * 2 * sizeof(int); if (ctx->prealloc_size_x < x_sz) { ctx->prealloc_size_x = x_sz; ggml_vk_preallocate_buffers(ctx, subctx); } if (ctx->prealloc_x_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } subbuf1 = { ctx->prealloc_x, 0, ctx->prealloc_x->size }; } std::array elements; elements[0] = ncolsp2; elements[1] = std::min((uint32_t)ggml_nrows(src0), ctx->device->properties.limits.maxComputeWorkGroupCount[1]); elements[2] = 1; // First dispatch initializes tmp_idx and does the first N passes where // there is only communication between threads in the same workgroup. { vk_op_argsort_push_constants pc2 = pc; pc2.outer_start = 0; pc2.outer_end = std::min(ncols_pad_log2, ctx->device->max_workgroup_size_log2); pc2.inner_start = 0; pc2.inner_end = 100; ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc2, elements); } if (!use_small) { ggml_vk_sync_buffers(ctx, subctx); // Loop over outer/inner passes, synchronizing between each pass. for (uint32_t outer = ctx->device->max_workgroup_size_log2; outer < ncols_pad_log2; ++outer) { for (uint32_t inner = 0; inner < outer + 1; ++inner) { vk_op_argsort_push_constants pc2 = pc; pc2.outer_start = outer; pc2.outer_end = outer + 1; pc2.inner_start = inner; pc2.inner_end = inner + 1; // When the inner idx is large enough, there's only communication // within a workgroup. So the remaining inner iterations can all // run in the same dispatch. if (outer - inner < pipeline_idx) { pc2.inner_end = 100; inner = outer; pipeline = ctx->device->pipeline_argsort_large_f32[pipeline_idx]; } else { // Smaller workgroup empirically seems to perform better pipeline = ctx->device->pipeline_argsort_large_f32[pipeline_idx - 2]; } ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc2, elements); ggml_vk_sync_buffers(ctx, subctx); } } ctx->prealloc_x_need_sync = true; } } static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { uint32_t ncols = src0->ne[0]; uint32_t nrows = ggml_nrows(src0); uint32_t k = dst->ne[0]; vk_op_topk_push_constants pc { ncols, ncols, ncols, k, nrows, 0, 0 }; if (ctx->prealloc_x_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } std::array elements; elements[1] = std::min(nrows, ctx->device->properties.limits.maxComputeWorkGroupCount[1]); elements[2] = 1; uint32_t num_elements = ncols; // Each iteration reduces a workgroup's worth of elements down to the K // largest elements. Repeat until we have the top K elements. // Need to do at least one iteration to write out the results. bool done_one_iter = false; uint32_t dbl_buf_index = 0; size_t dbl_buf_size; while (num_elements > k || !done_one_iter) { // Prefer going as small as num_topk_pipelines - 3 for perf reasons. // But if K is larger, then we need a larger workgroup uint32_t max_pipeline = num_topk_pipelines - 1; uint32_t preferred_pipeline = std::max(num_topk_pipelines - 3, (uint32_t)log2f(float(k)) + 2); max_pipeline = std::min(preferred_pipeline, max_pipeline); uint32_t min_pipeline = (uint32_t)log2f(float(k)) + 1; // require full subgroup min_pipeline = std::max(min_pipeline, ctx->device->subgroup_size_log2); uint32_t pipeline_idx = (uint32_t)ceilf(log2f(float(num_elements))); pipeline_idx = std::min(pipeline_idx, max_pipeline); pipeline_idx = std::max(pipeline_idx, min_pipeline); if (num_elements > (1u << pipeline_idx)) { // If we could finish on this loop iteration (i.e. a single workgroup) // then do so. It's better than the overhead of another pass. for (uint32_t i = pipeline_idx; i < num_topk_pipelines; ++i) { if (num_elements <= (1u << i)) { pipeline_idx = i; break; } } } vk_pipeline pipeline = ctx->device->pipeline_topk_f32[pipeline_idx]; // If the device doesn't support a pipeline this large, use smaller while (!pipeline) { pipeline_idx--; GGML_ASSERT(pipeline_idx >= min_pipeline); pipeline = ctx->device->pipeline_topk_f32[pipeline_idx]; } vk_op_topk_push_constants pc2 = pc; pc2.ncols_input = num_elements; // Number of elements remaining after this pass uint32_t num_dst_elements = (num_elements / pipeline->wg_denoms[0]) * k + std::min(k, num_elements % pipeline->wg_denoms[0]); pc2.ncols_output = num_dst_elements; if (!done_one_iter) { // Reserve space for ivec2 per element, double buffered // K per workgroup per row dbl_buf_size = num_dst_elements * nrows * 2 * sizeof(int); dbl_buf_size = ROUNDUP_POW2(dbl_buf_size, ctx->device->properties.limits.minStorageBufferOffsetAlignment); const size_t x_sz = dbl_buf_size * 2; if (ctx->prealloc_size_x < x_sz) { ctx->prealloc_size_x = x_sz; ggml_vk_preallocate_buffers(ctx, subctx); } } vk_subbuffer src_buf; vk_subbuffer dst_buf; if (num_elements == ncols) { pc2.first_pass = 1; src_buf = ggml_vk_tensor_subbuffer(ctx, src0); } else { src_buf = { ctx->prealloc_x, dbl_buf_index * dbl_buf_size, dbl_buf_size }; } if (num_dst_elements == k) { pc2.last_pass = 1; dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); } else { dst_buf = { ctx->prealloc_x, (dbl_buf_index ^ 1) * dbl_buf_size, dbl_buf_size }; } elements[0] = num_elements; ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src_buf, dst_buf }, pc2, elements); num_elements = num_dst_elements; dbl_buf_index ^= 1; if (num_elements > k) { ggml_vk_sync_buffers(ctx, subctx); } done_one_iter = true; } ctx->prealloc_x_need_sync = true; } static void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, ggml_nelements(src0)); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM, p); } static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, p); } static void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]); p.weight = 1.0f / (float)src0->ne[0]; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_MEAN, p); } static void ggml_vk_cumsum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_sum_rows_push_constants pc = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]); // Use the single pass shader when the rows are small or there are enough rows to fill the GPU. // For fewer, larger rows, use the multipass shader to spread each row across SMs. if (dst->ne[0] <= 4096 || ggml_nrows(dst) >= ctx->device->shader_core_count) { ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CUMSUM, pc); return; } // First pass computes partial sums within a block, and stores the last partial // to the temp buffer. Second pass sums the block partials from the temp buffer // and adds that to the result of the first pass. vk_pipeline pipeline1 = ctx->device->pipeline_cumsum_multipass1_f32; vk_pipeline pipeline2 = ctx->device->pipeline_cumsum_multipass2_f32; GGML_ASSERT(pipeline1 != nullptr && pipeline2 != nullptr); ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1); ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1); std::array elements; elements[0] = dst->ne[0]; elements[1] = (uint32_t)ggml_nrows(dst); elements[2] = 1; size_t temp_size = sizeof(float) * elements[0] * ggml_nrows(dst); if (ctx->prealloc_size_split_k < temp_size) { ctx->prealloc_size_split_k = temp_size; ggml_vk_preallocate_buffers(ctx, subctx); } vk_subbuffer src_buf = ggml_vk_tensor_subbuffer(ctx, src0); vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); vk_subbuffer temp_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0); if (ctx->prealloc_split_k_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, {src_buf, dst_buf, temp_buf}, pc, elements); ggml_vk_sync_buffers(ctx, subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, {src_buf, dst_buf, temp_buf}, pc, elements); ctx->prealloc_split_k_need_sync = true; } static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f, 0.0f, 0.0f }); } static void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_COUNT_EQUAL, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f }); } static void ggml_vk_solve_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SOLVE_TRI, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, }); } static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const int32_t s0 = dst->op_params[0]; const int32_t s1 = dst->op_params[1]; const int32_t p0 = dst->op_params[2]; const int32_t p1 = dst->op_params[3]; const int32_t d0 = dst->op_params[4]; const int32_t d1 = dst->op_params[5]; const bool is_2D = dst->op_params[6] == 1; const uint32_t IC = src1->ne[is_2D ? 2 : 1]; const uint32_t IH = is_2D ? src1->ne[1] : 1; const uint32_t IW = src1->ne[0]; const uint32_t KH = is_2D ? src0->ne[1] : 1; const uint32_t KW = src0->ne[0]; const uint32_t OH = is_2D ? dst->ne[2] : 1; const uint32_t OW = dst->ne[1]; const uint32_t offset_delta = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 const uint32_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32 const uint32_t batch = src1->ne[is_2D ? 3 : 2]; const ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; const vk_buffer d_buf = d_buf_ctx->dev_buffer; const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_offset(dst) + dst->view_offs; ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL, { dst_addr, batch_offset, offset_delta, IC, IW, IH, OW, OH, KW, KH, OH * batch, IC * KH * KW, s0, s1, p0, p1, d0, d1, batch * IC }); } static void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_TENSOR_BINARY_OP_LOCALS const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; const int32_t s2 = ((const int32_t *)(dst->op_params))[2]; const int32_t p0 = ((const int32_t *)(dst->op_params))[3]; const int32_t p1 = ((const int32_t *)(dst->op_params))[4]; const int32_t p2 = ((const int32_t *)(dst->op_params))[5]; const int32_t d0 = ((const int32_t *)(dst->op_params))[6]; const int32_t d1 = ((const int32_t *)(dst->op_params))[7]; const int32_t d2 = ((const int32_t *)(dst->op_params))[8]; const int32_t IC = ((const int32_t *)(dst->op_params))[9]; const int64_t N = ne13 / IC; const int64_t ID = ne12; const int64_t IH = ne11; const int64_t IW = ne10; const int64_t KD = ne02; const int64_t KH = ne01; const int64_t KW = ne00; const int64_t OD = ne3 / N; const int64_t OH = ne2; const int64_t OW = ne1; const ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; const vk_buffer d_buf = d_buf_ctx->dev_buffer; const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_offset(dst) + dst->view_offs; vk_op_im2col_3d_push_constants pc {}; pc.dst_addr = dst_addr; pc.nb10 = nb10 / ggml_type_size(src1->type); pc.nb11 = nb11 / ggml_type_size(src1->type); pc.nb12 = nb12 / ggml_type_size(src1->type); pc.nb13 = nb13 / ggml_type_size(src1->type); pc.s0 = s0; pc.s1 = s1; pc.s2 = s2; pc.p0 = p0; pc.p1 = p1; pc.p2 = p2; pc.d0 = d0; pc.d1 = d1; pc.d2 = d2; pc.IW = IW; pc.IH = IH; pc.ID = ID; pc.IC = IC; pc.KW = KW; pc.OH = OH; pc.KD_KH_KW = KD*KH*KW; pc.KH_KW = KH*KW; pc.IC_KD_KH_KW = IC*KD*KH*KW; pc.N_OD_OH = N*OD*OH; pc.OD_OH = OD*OH; pc.OD_OH_OW_IC_KD_KH_KW = OD*OH*OW*IC*KD*KH*KW; pc.OH_OW_IC_KD_KH_KW = OH*OW*IC*KD*KH*KW; pc.OW_IC_KD_KH_KW = OW*IC*KD*KH*KW; ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL_3D, std::move(pc)); } static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const uint32_t dim = dst->op_params[0]; const uint32_t max_period = dst->op_params[1]; const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, { nb1, dim, max_period, }); } static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { // src0: (K, Cout, Cin, 1) -- kernel // src1: (L, Cin, 1, 1) -- input // dst: (*, Cout, 1, 1) GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float)); const int32_t s0 = dst->op_params[0]; vk_op_conv_transpose_1d_push_constants p{}; p.Cout = static_cast(ne01); p.Cin = static_cast(ne02); p.K = static_cast(ne00); p.L = static_cast(ne10); p.KL = static_cast(ne0); p.nb01 = static_cast(nb01 / nb00); p.nb02 = static_cast(nb02 / nb00); p.nb11 = static_cast(nb11 / nb10); p.nb1 = static_cast(nb1 / nb0); p.s0 = static_cast(s0); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p)); } static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { uint32_t op = static_cast(dst->op_params[0]); const int32_t k1 = dst->op_params[1]; const int32_t k0 = dst->op_params[2]; const int32_t s1 = dst->op_params[3]; const int32_t s0 = dst->op_params[4]; const int32_t p1 = dst->op_params[5]; const int32_t p0 = dst->op_params[6]; const uint32_t IH = src0->ne[1]; const uint32_t IW = src0->ne[0]; const uint32_t N = dst->ne[3]; const uint32_t OC = dst->ne[2]; const uint32_t OH = dst->ne[1]; const uint32_t OW = dst->ne[0]; const uint32_t parallel_elements = N * OC * OH * OW; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_POOL_2D, { IW, IH, OW, OH, OC, parallel_elements, op, k0, k1, s0, s1, p0, p1, }); } static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); GGML_ASSERT(nb0 == sizeof(float)); bool transpose = dst->op == GGML_OP_CONV_TRANSPOSE_2D; vk_op_conv2d_push_constants p{}; p.Cout = static_cast(!transpose ? ne03 : ne02); p.Cin = static_cast(!transpose ? ne02 : ne03); p.N = static_cast(ne13); GGML_ASSERT(p.Cout == ne2); GGML_ASSERT(p.Cin == ne12); p.W = static_cast(ne10); p.H = static_cast(ne11); p.OW = static_cast(ne0); p.OH = static_cast(ne1); p.nb01 = static_cast(nb01 / nb00); p.nb02 = static_cast(nb02 / nb00); p.nb03 = static_cast(nb03 / nb00); p.nb11 = static_cast(nb11 / nb10); p.nb12 = static_cast(nb12 / nb10); p.nb13 = static_cast(nb13 / nb10); p.nb1 = static_cast(nb1 / nb0); p.nb2 = static_cast(nb2 / nb0); p.nb3 = static_cast(nb3 / nb0); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, dst->op, std::move(p)); } static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { vk_op_conv2d_dw_push_constants p{}; p.ne = ggml_nelements(dst); p.channels = dst->ne[2]; p.batches = dst->ne[3]; p.dst_w = dst->ne[0]; p.dst_h = dst->ne[1]; p.src_w = src1->ne[0]; p.src_h = src1->ne[1]; p.knl_w = src0->ne[0]; p.knl_h = src0->ne[1]; p.stride_x = dst->op_params[0]; p.stride_y = dst->op_params[1]; p.pad_x = dst->op_params[2]; p.pad_y = dst->op_params[3]; p.dilation_x = dst->op_params[4]; p.dilation_y = dst->op_params[5]; GGML_ASSERT(src0->ne[3] == p.channels); GGML_ASSERT(src1->ne[3] == p.batches); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D_DW, std::move(p)); } static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const float * op_params = (const float *)dst->op_params; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f, 0.0f, 0.0f }); } #ifdef GGML_VULKAN_RUN_TESTS static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) { if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) { return; } i0 = std::max(i0, 5); i1 = std::max(i1, 5); i2 = std::max(i2, 0); fprintf(stderr, " "); for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { fprintf(stderr, "%7d ", idx1); } fprintf(stderr, "\n"); for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) { fprintf(stderr, "%7d: ", idx0); for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { if (idx0 >= 0 && idx0 < ne0 && idx1 >= 0 && idx1 < ne1) { float val; if (type == GGML_TYPE_F32) { val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0); } else if (type == GGML_TYPE_F16) { val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0)); } else { GGML_ABORT("fatal error"); } fprintf(stderr, "% 7.2f ", val); } else { fprintf(stderr, " "); } } fprintf(stderr, "\n"); } } template static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) { VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")"); const size_t x_ne = m * k * batch; const size_t y_ne = k * n * batch; const size_t d_ne = m * n * batch; vk_pipeline p; std::string shname; if (shader_size == 0) { if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f32->a_s; shname = "F32_ALIGNED_S"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f32_f16->a_s; shname = "F32_F16_ALIGNED_S"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_s; shname = "F16_F32_ALIGNED_S"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f16.f32acc->a_s; shname = "F16_ALIGNED_S"; } else { GGML_ABORT("fatal error"); } } else if (shader_size == 1) { if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f32->a_m; shname = "F32_ALIGNED_M"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f32_f16->a_m; shname = "F32_F16_ALIGNED_M"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_m; shname = "F16_F32_ALIGNED_M"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f16.f32acc->a_m; shname = "F16_ALIGNED_M"; } else { GGML_ABORT("fatal error"); } } else if (shader_size == 2) { if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f32->a_l; shname = "F32_ALIGNED_L"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f32_f16->a_l; shname = "F32_F16_ALIGNED_L"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f16_f32.f32acc->a_l; shname = "F16_F32_ALIGNED_L"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f16.f32acc->a_l; shname = "F16_ALIGNED_L"; } else { GGML_ABORT("fatal error"); } } else { GGML_ASSERT(0); } const size_t kpad = ggml_vk_align_size(k, p->align); if (k != kpad) { if (shader_size == 0) { if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f32->s; shname = "F32_S"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f32_f16->s; shname = "F32_F16_S"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f16_f32.f32acc->s; shname = "F16_F32_S"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f16.f32acc->s; shname = "F16_S"; } } else if (shader_size == 1) { if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f32->m; shname = "F32_M"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f32_f16->m; shname = "F32_F16_M"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f16_f32.f32acc->m; shname = "F16_F32_M"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f16.f32acc->m; shname = "F16_M"; } } else if (shader_size == 2) { if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f32->l; shname = "F32_L"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f32_f16->l; shname = "F32_F16_L"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f16_f32.f32acc->l; shname = "F16_F32_L"; } else if (std::is_same() && std::is_same()) { p = ctx->device->pipeline_matmul_f16.f32acc->l; shname = "F16_L"; } } } if (split_k > 1) { ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { // Resize buffer if (ctx->prealloc_split_k != nullptr) { ggml_vk_destroy_buffer(ctx->prealloc_split_k); } ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, {vk::MemoryPropertyFlagBits::eDeviceLocal}); } } ggml_pipeline_allocate_descriptor_sets(ctx); vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, {vk::MemoryPropertyFlagBits::eDeviceLocal}); vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, {vk::MemoryPropertyFlagBits::eDeviceLocal}); vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, {vk::MemoryPropertyFlagBits::eDeviceLocal}); X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne); Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne); float* d = (float *) malloc(sizeof(float) * d_ne); for (size_t i = 0; i < x_ne; i++) { if (std::is_same()) { x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; // x[i] = 1.0f; // x[i] = i + 1; // x[i] = (i % k == i / k) ? 1.0f : 0.0f; } else if (std::is_same()) { x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); // x[i] = ggml_fp32_to_fp16(1.0f); // x[i] = ggml_fp32_to_fp16(i + 1); // x[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f); } else { GGML_ABORT("fatal error"); } } for (size_t i = 0; i < y_ne; i++) { if (std::is_same()) { y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; // y[i] = (i % k == i / k) ? 1.0f : 0.0f; // y[i] = i + 1; } else if (std::is_same()) { y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); // y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f); // y[i] = ggml_fp32_to_fp16(i + 1); } else { GGML_ABORT("fatal error"); } } ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch); ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ggml_vk_ctx_begin(ctx->device, subctx); for (size_t i = 0; i < num_it; i++) { ggml_vk_matmul( ctx, subctx, p, ggml_vk_subbuffer(ctx, d_X), ggml_vk_subbuffer(ctx, d_Y), ggml_vk_subbuffer(ctx, d_D), ggml_vk_subbuffer(ctx, ctx->prealloc_split_k), m, n, k, k, k, m, k*m, k*n, m*n, split_k, batch, batch, batch, 1, 1, n ); } ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); ggml_vk_submit(subctx, ctx->fence); VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences"); ctx->device->device.resetFences({ ctx->fence }); ggml_vk_queue_command_pools_cleanup(ctx->device); auto end = std::chrono::high_resolution_clock::now(); double time = std::chrono::duration_cast(end-begin).count() / 1000.0; // copy dst to host ggml_vk_buffer_read(d_D, 0, d, sizeof(float) * d_ne); float * d_chk = (float *) malloc(sizeof(float) * d_ne); ggml_init_params iparams = { /*.mem_size =*/ 1024*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; ggml_context * ggml_ctx = ggml_init(iparams); ggml_type src0_type; ggml_type src1_type; if (std::is_same()) { src0_type = GGML_TYPE_F32; } else if (std::is_same()) { src0_type = GGML_TYPE_F16; } else { GGML_ABORT("fatal error"); } if (std::is_same()) { src1_type = GGML_TYPE_F32; } else if (std::is_same()) { src1_type = GGML_TYPE_F16; } else { GGML_ABORT("fatal error"); } ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch); ggml_tensor * src1_ggml = ggml_new_tensor_3d(ggml_ctx, src1_type, k, n, batch); ggml_tensor * tensor_ggml = ggml_mul_mat(ggml_ctx, src0_ggml, src1_ggml); src0_ggml->data = x; src1_ggml->data = y; tensor_ggml->data = d_chk; ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); ggml_build_forward_expand(cgraph, tensor_ggml); ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1); ggml_free(ggml_ctx); double avg_err = 0.0; int first_err_n = -1; int first_err_m = -1; int first_err_b = -1; for (size_t i = 0; i < m*n*batch; i++) { double err = std::fabs(d[i] - d_chk[i]); avg_err += err; if ((err > 0.05f || std::isnan(err)) && first_err_n == -1) { first_err_b = i / (m * n); first_err_n = (i % (m * n)) / m; first_err_m = (i % (m * n)) % m; } } avg_err /= m * n; double tflops = 2.0*m*n*k*batch*num_it / (time / 1000.0) / (1000.0*1000.0*1000.0*1000.0); std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl; if (avg_err > 0.1 || std::isnan(avg_err)) { std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl; std::cerr << "Actual result: " << std::endl << std::endl; ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); std::cerr << "Expected result: " << std::endl << std::endl; ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); if (split_k > 1) { float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k); ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k); std::cerr << "d_buf0: " << std::endl << std::endl; ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); std::cerr << "d_buf1: " << std::endl << std::endl; ggml_vk_print_matrix_area(split_k_buf + d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); std::cerr << "d_buf2: " << std::endl << std::endl; ggml_vk_print_matrix_area(split_k_buf + 2 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); std::cerr << "d_buf3: " << std::endl << std::endl; ggml_vk_print_matrix_area(split_k_buf + 3 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); free(split_k_buf); } } free(d_chk); ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool); ggml_vk_destroy_buffer(d_X); ggml_vk_destroy_buffer(d_Y); ggml_vk_destroy_buffer(d_D); free(x); free(y); free(d); } static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1, int i2, int i3) { if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) { return; } i0 = std::max(i0, 5); i1 = std::max(i1, 5); i2 = std::max(i2, 0); i3 = std::max(i3, 0); fprintf(stderr, " "); for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { fprintf(stderr, "%7d ", idx1); } fprintf(stderr, "\n"); for (int idx0 = i0 - 5; idx0 < i0 + 5; idx0++) { fprintf(stderr, "%7d: ", idx0); for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) { float val; if (tensor->type == GGML_TYPE_F32) { val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); } else if (tensor->type == GGML_TYPE_F16) { val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0])); } else { GGML_ABORT("fatal error"); } fprintf(stderr, "% 7.2f ", val); } else { fprintf(stderr, " "); } } fprintf(stderr, "\n"); } } static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) { ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr); } static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) { if (quant == GGML_TYPE_F32) { memcpy(to, from, sizeof(float) * ne); return; } const auto * tt = ggml_get_type_traits(quant); ggml_to_float_t dequant_fn = tt->to_float; dequant_fn(from, to, ne); } static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")"); const size_t x_sz = sizeof(float) * ne; const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne; const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant); float * x = (float *) malloc(x_sz); void * qx = malloc(qx_sz); vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz_f16, {vk::MemoryPropertyFlagBits::eDeviceLocal}); float * x_ref = (float *) malloc(x_sz); ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16); for (size_t i = 0; i < ne; i++) { x[i] = rand() / (float)RAND_MAX; } vk_pipeline p = ggml_vk_get_to_fp16(ctx, quant); ggml_vk_quantize_data(x, qx, ne, quant); ggml_vk_dequantize_data(qx, x_ref, ne, quant); ggml_pipeline_request_descriptor_sets(ctx, p, 1); ggml_pipeline_allocate_descriptor_sets(ctx); ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ggml_vk_ctx_begin(ctx->device, subctx); const std::vector pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne }; ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1}); ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); ggml_vk_submit(subctx, ctx->fence); VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); ctx->device->device.resetFences({ ctx->fence }); ggml_vk_queue_command_pools_cleanup(ctx->device); auto end = std::chrono::high_resolution_clock::now(); double ms_dequant = std::chrono::duration_cast(end-begin).count() / 1000.0; ggml_vk_buffer_read(x_buf, 0, x_chk, x_sz_f16); int first_err = -1; double avg_err = 0.0; for (size_t i = 0; i < ne; i++) { double error = std::fabs(x_ref[i] - ggml_fp16_to_fp32(x_chk[i])); avg_err += error; if (first_err < 0 && error > 0.05) { first_err = i; } } avg_err /= ne; std::cerr << "TEST DEQUANT " << ggml_type_name(quant) << " time=" << ms_dequant << "ms avg_err=" << avg_err << std::endl; if (avg_err > 0.1) { std::cerr << "first_error = " << first_err << std::endl; std::cerr << "Actual result: " << std::endl << std::endl; for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) { std::cerr << ggml_fp16_to_fp32(x_chk[i]) << ", "; } std::cerr << std::endl << "Expected result: " << std::endl << std::endl; for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) { std::cerr << x_ref[i] << ", "; } std::cerr << std::endl; } ggml_vk_destroy_buffer(x_buf); ggml_vk_destroy_buffer(qx_buf); free(x); free(qx); free(x_ref); free(x_chk); } // This does not work without ggml q8_1 quantization support // // typedef uint16_t ggml_half; // typedef uint32_t ggml_half2; // // #define QK8_1 32 // typedef struct { // union { // struct { // ggml_half d; // delta // ggml_half s; // d * sum(qs[i]) // } GGML_COMMON_AGGR_S; // ggml_half2 ds; // } GGML_COMMON_AGGR_U; // int8_t qs[QK8_1]; // quants // } block_q8_1; // // static void ggml_vk_test_quantize(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { // VK_LOG_DEBUG("ggml_vk_test_quantize(" << ne << ")"); // GGML_ASSERT(quant == GGML_TYPE_Q8_1); // // const size_t x_sz = sizeof(float) * ne; // const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant); // float * x = (float *) malloc(x_sz); // block_q8_1 * qx = (block_q8_1 *)malloc(qx_sz); // block_q8_1 * qx_res = (block_q8_1 *)malloc(qx_sz); // vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); // vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); // // for (size_t i = 0; i < ne; i++) { // x[i] = rand() / (float)RAND_MAX; // } // // vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant); // // ggml_pipeline_request_descriptor_sets(ctx, p, 1); // // ggml_pipeline_allocate_descriptor_sets(ctx); // // ggml_vk_buffer_write(x_buf, 0, x, x_sz); // // vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); // ggml_vk_ctx_begin(ctx->device, subctx); // ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, x_buf), ggml_vk_subbuffer(ctx, qx_buf), ne); // ggml_vk_ctx_end(subctx); // // auto begin = std::chrono::high_resolution_clock::now(); // // ggml_vk_submit(subctx, ctx->fence); // VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences"); // ctx->device->device.resetFences({ ctx->fence }); // ggml_vk_queue_command_pools_cleanup(ctx->device); // // auto end = std::chrono::high_resolution_clock::now(); // // double ms_quant = std::chrono::duration_cast(end-begin).count() / 1000.0; // ggml_vk_buffer_read(qx_buf, 0, qx, qx_sz); // // ggml_vk_quantize_data(x, qx_res, ne, quant); // // int first_err = -1; // // for (size_t i = 0; i < ne / 32; i++) { // double error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d)); // // if (first_err < 0 && error > 0.1) { // first_err = i; // } // // error = std::fabs(ggml_fp16_to_fp32(qx_res[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) - ggml_fp16_to_fp32(qx[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s)); // // if (first_err < 0 && error > 0.1) { // first_err = i; // } // // for (size_t j = 0; j < 32; j++) { // uint64_t error = std::abs(qx_res[i].qs[j] - qx[i].qs[j]); // // if (first_err < 0 && error > 1) { // first_err = i; // } // } // } // // std::cerr << "TEST QUANTIZE " << ggml_type_name(quant) << " time=" << ms_quant << "ms " << (first_err == -1 ? "CORRECT" : "INCORRECT") << std::endl; // // if (first_err != -1) { // std::cerr << "first_error = " << first_err << std::endl; // std::cerr << "Actual result: " << std::endl << std::endl; // std::cout << "d=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " "; // for (size_t j = 0; j < 32; j++) { // std::cout << " qs" << j << "=" << (uint32_t)qx[first_err].qs[j] << " "; // } // std::cerr << std::endl << std::endl << "Expected result: " << std::endl << std::endl; // std::cout << "d=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d) << " s=" << ggml_fp16_to_fp32(qx_res[first_err].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) << " "; // for (size_t j = 0; j < 32; j++) { // std::cout << " qs" << j << "=" << (uint32_t)qx_res[first_err].qs[j] << " "; // } // std::cerr << std::endl; // } // // ggml_vk_destroy_buffer(x_buf); // ggml_vk_destroy_buffer(qx_buf); // // free(x); // free(qx); // free(qx_res); // } static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant, bool mmq = false) { VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")"); const size_t x_ne = m * k * batch; const size_t y_ne = k * n * batch; const size_t d_ne = m * n * batch; vk_matmul_pipeline2 * pipelines; if (mmq) { pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1; } else { pipelines = ctx->device->pipeline_dequant_mul_mat_mat; } const bool fp16acc = ctx->device->fp16; vk_pipeline p; std::string shname; if (shader_size == 0) { p = fp16acc ? pipelines[quant].f16acc->a_s : pipelines[quant].f32acc->a_s; shname = std::string(ggml_type_name(quant)) + "_ALIGNED_S"; } else if (shader_size == 1) { p = fp16acc ? pipelines[quant].f16acc->a_m : pipelines[quant].f32acc->a_m; shname = std::string(ggml_type_name(quant)) + "_ALIGNED_M"; } else if (shader_size == 2) { p = fp16acc ? pipelines[quant].f16acc->a_l : pipelines[quant].f32acc->a_l; shname = std::string(ggml_type_name(quant)) + "_ALIGNED_L"; } else { GGML_ASSERT(0); } const size_t kpad = mmq ? 0 : ggml_vk_align_size(k, p->align); if (mmq || k != kpad) { if (shader_size == 0) { p = fp16acc ? pipelines[quant].f16acc->s : pipelines[quant].f32acc->s; shname = std::string(ggml_type_name(quant)) + "_S"; } else if (shader_size == 1) { p = fp16acc ? pipelines[quant].f16acc->m : pipelines[quant].f32acc->m; shname = std::string(ggml_type_name(quant)) + "_M"; } else if (shader_size == 2) { p = fp16acc ? pipelines[quant].f16acc->l : pipelines[quant].f32acc->l; shname = std::string(ggml_type_name(quant)) + "_L"; } else { GGML_ASSERT(0); } } if (p == nullptr) { std::cerr << "error: no pipeline for ggml_vk_test_dequant_matmul " << ggml_type_name(quant) << std::endl; return; } const size_t x_sz = sizeof(float) * x_ne; const size_t y_sz = sizeof(float) * y_ne; const size_t qx_sz = x_ne * ggml_type_size(quant)/ggml_blck_size(quant); const size_t qy_sz = mmq ? y_ne * ggml_type_size(GGML_TYPE_Q8_1)/ggml_blck_size(GGML_TYPE_Q8_1) : y_sz; const size_t d_sz = sizeof(float) * d_ne; float * x = (float *) malloc(x_sz); float * y = (float *) malloc(y_sz); void * qx = malloc(qx_sz); vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); vk_buffer y_buf = ggml_vk_create_buffer_check(ctx->device, y_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); vk_buffer qy_buf = ggml_vk_create_buffer_check(ctx->device, qy_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); vk_buffer d_buf = ggml_vk_create_buffer_check(ctx->device, d_sz, {vk::MemoryPropertyFlagBits::eDeviceLocal}); float * d = (float *) malloc(d_sz); float * d_chk = (float *) malloc(d_sz); for (size_t i = 0; i < x_ne; i++) { x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; // x[i] = (i % k == i / k) ? 1.0f : 0.0f; // x[i] = i % k; } ggml_vk_quantize_data(x, qx, x_ne, quant); for (size_t i = 0; i < y_ne; i++) { y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; // y[i] = (i % k == i / k) ? 1.0f : 0.0f; // y[i] = i % k; } if (split_k > 1) { ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { // Resize buffer if (ctx->prealloc_split_k != nullptr) { ggml_vk_destroy_buffer(ctx->prealloc_split_k); } ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, {vk::MemoryPropertyFlagBits::eDeviceLocal}); } } if (mmq) { vk_pipeline pipeline_quantize_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); ggml_pipeline_request_descriptor_sets(ctx, pipeline_quantize_q8_1, num_it); } ggml_pipeline_allocate_descriptor_sets(ctx); ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); ggml_vk_buffer_write(y_buf, 0, y, y_sz); vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ggml_vk_ctx_begin(ctx->device, subctx); if (mmq) { for (size_t i = 0; i < num_it; i++) { ggml_vk_quantize_q8_1(ctx, subctx, { y_buf, 0, y_sz }, { qy_buf, 0, qy_sz }, y_ne); ggml_vk_matmul( ctx, subctx, p, { qx_buf, 0, qx_sz }, { qy_buf, 0, qy_sz }, { d_buf, 0, d_sz }, { ctx->prealloc_split_k, 0, ctx->prealloc_size_split_k }, m, n, k, k, k, m, k*m, k*n, m*n, split_k, batch, batch, batch, 1, 1, n ); } } else { for (size_t i = 0; i < num_it; i++) { ggml_vk_matmul( ctx, subctx, p, { qx_buf, 0, qx_sz }, { y_buf, 0, y_sz }, { d_buf, 0, d_sz }, { ctx->prealloc_split_k, 0, ctx->prealloc_size_split_k }, m, n, k, k, k, m, k*m, k*n, m*n, split_k, batch, batch, batch, 1, 1, n ); } } ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); ggml_vk_submit(subctx, ctx->fence); VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); ctx->device->device.resetFences({ ctx->fence }); ggml_vk_queue_command_pools_cleanup(ctx->device); auto end = std::chrono::high_resolution_clock::now(); double time_ms = std::chrono::duration_cast(end-begin).count() / 1000.0; ggml_vk_buffer_read(d_buf, 0, d, d_sz); ggml_init_params iparams = { /*.mem_size =*/ 1024*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; ggml_context * ggml_ctx = ggml_init(iparams); ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, quant, k, m, batch); ggml_tensor * src1_ggml = ggml_new_tensor_3d(ggml_ctx, GGML_TYPE_F32, k, n, batch); ggml_tensor * tensor_ggml = ggml_mul_mat(ggml_ctx, src0_ggml, src1_ggml); src0_ggml->data = qx; src1_ggml->data = y; tensor_ggml->data = d_chk; ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); ggml_build_forward_expand(cgraph, tensor_ggml); ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1); ggml_free(ggml_ctx); double avg_err = 0.0; int first_err_n = -1; int first_err_m = -1; int first_err_b = -1; for (size_t i = 0; i < m*n*batch; i++) { double err = std::fabs(d[i] - d_chk[i]); avg_err += err; if ((err > 0.05f || std::isnan(err)) && first_err_n == -1) { first_err_b = i / (m * n); first_err_n = (i % (m * n)) / m; first_err_m = (i % (m * n)) % m; } } avg_err /= m * n; double tflops = 2.0*m*n*k*batch*num_it / (time_ms / 1000.0) / (1000.0*1000.0*1000.0*1000.0); std::cerr << "TEST dequant matmul " << shname; if (mmq) { std::cerr << " mmq"; } std::cerr << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl; if (avg_err > 0.01 || std::isnan(avg_err)) { std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl; std::cerr << "Actual result: " << std::endl << std::endl; ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); std::cerr << std::endl; std::cerr << "Expected result: " << std::endl << std::endl; ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); std::cerr << "src0: " << std::endl << std::endl; ggml_vk_print_matrix_area(x, GGML_TYPE_F32, k, m, first_err_m, first_err_n, first_err_b); std::cerr << std::endl; std::cerr << "src1: " << std::endl << std::endl; ggml_vk_print_matrix_area(y, GGML_TYPE_F32, k, n, first_err_m, first_err_n, first_err_b); if (split_k > 1) { float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k); ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k); std::cerr << "d_buf0: " << std::endl << std::endl; ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); std::cerr << "d_buf1: " << std::endl << std::endl; ggml_vk_print_matrix_area(split_k_buf + d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); std::cerr << "d_buf2: " << std::endl << std::endl; ggml_vk_print_matrix_area(split_k_buf + 2 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); std::cerr << "d_buf3: " << std::endl << std::endl; ggml_vk_print_matrix_area(split_k_buf + 3 * d_ne, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); free(split_k_buf); } } ggml_vk_destroy_buffer(qx_buf); ggml_vk_destroy_buffer(y_buf); ggml_vk_destroy_buffer(qy_buf); ggml_vk_destroy_buffer(d_buf); free(x); free(qx); free(y); free(d); free(d_chk); } #endif static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx) { #if defined(GGML_VULKAN_RUN_TESTS) const std::vector vals { 512, 512, 128, 128, 512, 512, 4096, 512, 4096, 11008, 512, 4096, 4096, 512, 11008, 32000, 512, 4096, 8, 8, 8, 100, 46, 576, 623, 111, 128, 100, 46, 558, 512, 1, 256, 128, 110, 622, 511, 511, 127, 511, 511, 7, 511, 511, 17, 49, 49, 128, 128, 49, 49, 4096, 49, 4096, }; const size_t num_it = 100; ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q4_0, true); ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q4_0, true); ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q4_0, true); ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q8_0); ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q8_0); ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q8_0); ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 0, GGML_TYPE_Q8_0, true); ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 1, GGML_TYPE_Q8_0, true); ggml_vk_test_dequant_matmul(ctx, 4096, 512, 4096, 2, num_it, 1, 2, GGML_TYPE_Q8_0, true); abort(); for (size_t i = 0; i < vals.size(); i += 3) { ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2); std::cerr << '\n'; ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0); ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1); ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2); std::cerr << '\n'; ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0); ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1); ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2); std::cerr << '\n' << std::endl; if (vals[i + 2] % 32 == 0) { ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2, GGML_TYPE_Q4_0); std::cerr << '\n'; ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2, GGML_TYPE_Q4_0); std::cerr << '\n'; ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1, GGML_TYPE_Q4_0); ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2, GGML_TYPE_Q4_0); std::cerr << '\n' << std::endl; } if (vals[i + 2] % 256 == 0) { ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0, GGML_TYPE_Q4_K); ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1, GGML_TYPE_Q4_K); ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2, GGML_TYPE_Q4_K); std::cerr << '\n'; ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0, GGML_TYPE_Q4_K); ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1, GGML_TYPE_Q4_K); ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2, GGML_TYPE_Q4_K); std::cerr << '\n'; ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0, GGML_TYPE_Q4_K); ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1, GGML_TYPE_Q4_K); ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2, GGML_TYPE_Q4_K); std::cerr << '\n' << std::endl; } } GGML_ABORT("fatal error"); #endif if (subctx) { // Submit and wait for any pending work before reallocating the buffers ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, {}); ctx->submit_pending = true; ggml_vk_synchronize(ctx); GGML_ASSERT(ctx->compute_ctx.expired()); ggml_vk_ctx_begin(ctx->device, subctx); ctx->compute_ctx = subctx; } if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) { VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")"); // Resize buffer if (ctx->prealloc_x != nullptr) { ggml_vk_destroy_buffer(ctx->prealloc_x); } ctx->prealloc_x = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_x); } if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) { VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")"); // Resize buffer if (ctx->prealloc_y != nullptr) { ggml_vk_destroy_buffer(ctx->prealloc_y); } ctx->prealloc_y = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_y); ctx->prealloc_y_last_tensor_used = nullptr; } if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) { VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")"); // Resize buffer if (ctx->prealloc_split_k != nullptr) { ggml_vk_destroy_buffer(ctx->prealloc_split_k); } ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_split_k); } if (ctx->prealloc_add_rms_partials == nullptr || (ctx->prealloc_size_add_rms_partials > 0 && ctx->prealloc_add_rms_partials->size < ctx->prealloc_size_add_rms_partials)) { VK_LOG_MEMORY("ggml_vk_preallocate_buffers(add_partials_size: " << ctx->prealloc_add_rms_partials << ")"); // Resize buffer if (ctx->prealloc_add_rms_partials != nullptr) { ggml_vk_destroy_buffer(ctx->prealloc_add_rms_partials); } ctx->prealloc_add_rms_partials = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_add_rms_partials); } } static void ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool almost_ready); // Returns true if node has enqueued work into the queue, false otherwise // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool last_node, bool almost_ready, bool submit){ ggml_tensor * node = cgraph->nodes[node_idx]; if (ggml_is_empty(node) || ggml_op_is_empty(node->op) || !node->buffer) { return false; } if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { return false; } VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")"); ctx->semaphore_idx = 0; ggml_tensor * src0 = node->src[0]; ggml_tensor * src1 = node->src[1]; ggml_tensor * src2 = node->src[2]; ggml_tensor * src3 = node->src[3]; if (node->op == GGML_OP_ADD) { int next_node_idx = node_idx + 1 + ctx->num_additional_fused_ops; if (next_node_idx < cgraph->n_nodes && cgraph->nodes[next_node_idx]->op == GGML_OP_RMS_NORM && cgraph->nodes[next_node_idx]->src[0] == cgraph->nodes[next_node_idx - 1] && ggml_nrows(cgraph->nodes[next_node_idx]) == 1 && ctx->device->add_rms_fusion) { uint32_t size = ggml_vk_rms_partials_size(ctx, cgraph->nodes[node_idx]); ctx->do_add_rms_partials_offset_calculation = true; if (ctx->prealloc_size_add_rms_partials_offset + size <= ctx->prealloc_size_add_rms_partials) { ctx->do_add_rms_partials = true; } } } vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); { // This logic detects dependencies between modes in the graph and calls ggml_vk_sync_buffers // to synchronize them. This handles most "normal" synchronization when computing the graph, and when // there is no auxiliary memory use, it shouldn't be necessary to call ggml_vk_sync_buffers // outside of this logic. When a node uses one of the prealloc buffers for something like // dequantization or split_k, additional synchronization is needed between those passes. bool need_sync = false; // Check whether "node" requires synchronization. The node requires synchronization if it // overlaps in memory with another unsynchronized node and at least one of them is a write. // Destination nodes are checked against both the written/read lists. Source nodes are only // checked against the written list. Two nodes overlap in memory if they come from the same // buffer and the tensor or view ranges overlap. auto const &overlaps_unsynced = [&](const ggml_tensor *node, const std::vector &unsynced_nodes) -> bool { if (unsynced_nodes.size() == 0) { return false; } auto n_base = vk_tensor_offset(node) + node->view_offs; auto n_size = ggml_nbytes(node); ggml_backend_vk_buffer_context * a_buf_ctx = (ggml_backend_vk_buffer_context *)node->buffer->context; vk_buffer a_buf = a_buf_ctx->dev_buffer; for (auto &other : unsynced_nodes) { ggml_backend_vk_buffer_context * o_buf_ctx = (ggml_backend_vk_buffer_context *)other->buffer->context; vk_buffer o_buf = o_buf_ctx->dev_buffer; if (a_buf == o_buf) { auto o_base = vk_tensor_offset(other) + other->view_offs; auto o_size = ggml_nbytes(other); if ((o_base <= n_base && n_base < o_base + o_size) || (n_base <= o_base && o_base < n_base + n_size)) { return true; } } } return false; }; // For all fused ops, check if the destination node or any of the source // nodes require synchronization. for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1 && !need_sync; ++i) { const ggml_tensor *cur_node = cgraph->nodes[node_idx + i]; // If the node actually writes to memory, then check if it needs to sync if (ctx->fused_ops_write_mask & (1 << i)) { if (overlaps_unsynced(cur_node, ctx->unsynced_nodes_read) || overlaps_unsynced(cur_node, ctx->unsynced_nodes_written)) { need_sync = true; break; } } for (uint32_t j = 0; j < GGML_MAX_SRC; ++j) { if (!cur_node->src[j]) { continue; } if (overlaps_unsynced(cur_node->src[j], ctx->unsynced_nodes_written)) { need_sync = true; break; } } } if (need_sync) { if (vk_enable_sync_logger) { std::cerr << "sync" << std::endl; } ctx->unsynced_nodes_written.clear(); ctx->unsynced_nodes_read.clear(); ggml_vk_sync_buffers(ctx, compute_ctx); if (vk_perf_logger_enabled && vk_perf_logger_concurrent) { ctx->query_node_idx[ctx->query_idx] = node_idx; compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); ggml_vk_sync_buffers(ctx, compute_ctx); } } // Add all fused nodes to the unsynchronized lists. for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) { const ggml_tensor *cur_node = cgraph->nodes[node_idx + i]; // Multiple outputs could be written, e.g. in topk_moe. Add them all to the list. if (ctx->fused_ops_write_mask & (1 << i)) { ctx->unsynced_nodes_written.push_back(cur_node); } for (uint32_t j = 0; j < GGML_MAX_SRC; ++j) { if (!cur_node->src[j]) { continue; } ctx->unsynced_nodes_read.push_back(cur_node->src[j]); } } } if (vk_enable_sync_logger) { for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) { auto *n = cgraph->nodes[node_idx + i]; std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name; if (n->op == GGML_OP_GLU) { std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " "; } if (n->op == GGML_OP_ROPE) { const int mode = ((const int32_t *) n->op_params)[2]; std::cerr << " rope mode: " << mode; } std::cerr << std::endl; } } switch (node->op) { case GGML_OP_REPEAT: ggml_vk_repeat(ctx, compute_ctx, src0, node); break; case GGML_OP_REPEAT_BACK: ggml_vk_repeat_back(ctx, compute_ctx, src0, node); break; case GGML_OP_ACC: case GGML_OP_SET: ggml_vk_acc(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_GET_ROWS: ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_ADD: if (ctx->num_additional_fused_ops) { ggml_vk_multi_add(ctx, compute_ctx, cgraph, node_idx); } else { ggml_vk_add(ctx, compute_ctx, src0, src1, node); } break; case GGML_OP_SUB: ggml_vk_sub(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_MUL: ggml_vk_mul(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_DIV: ggml_vk_div(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_ADD_ID: ggml_vk_add_id(ctx, compute_ctx, src0, src1, src2, node); break; case GGML_OP_CONCAT: ggml_vk_concat(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_UPSCALE: ggml_vk_upscale(ctx, compute_ctx, src0, node); break; case GGML_OP_ADD1: ggml_vk_add1(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_ARANGE: ggml_vk_arange(ctx, compute_ctx, node); break; case GGML_OP_FILL: ggml_vk_fill(ctx, compute_ctx, node); break; case GGML_OP_SCALE: ggml_vk_scale(ctx, compute_ctx, src0, node); break; case GGML_OP_SQR: ggml_vk_sqr(ctx, compute_ctx, src0, node); break; case GGML_OP_SQRT: ggml_vk_sqrt(ctx, compute_ctx, src0, node); break; case GGML_OP_SIN: ggml_vk_sin(ctx, compute_ctx, src0, node); break; case GGML_OP_COS: ggml_vk_cos(ctx, compute_ctx, src0, node); break; case GGML_OP_LOG: ggml_vk_log(ctx, compute_ctx, src0, node); break; case GGML_OP_TRI: ggml_vk_tri(ctx, compute_ctx, src0, node); break; case GGML_OP_DIAG: ggml_vk_diag(ctx, compute_ctx, src0, node); break; case GGML_OP_CLAMP: ggml_vk_clamp(ctx, compute_ctx, src0, node); break; case GGML_OP_PAD: ggml_vk_pad(ctx, compute_ctx, src0, node); break; case GGML_OP_ROLL: ggml_vk_roll(ctx, compute_ctx, src0, node); break; case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: ggml_vk_cpy(ctx, compute_ctx, src0, node); break; case GGML_OP_SET_ROWS: ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_SILU_BACK: ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_NORM: ggml_vk_norm(ctx, compute_ctx, src0, node); break; case GGML_OP_GROUP_NORM: ggml_vk_group_norm(ctx, compute_ctx, src0, node); break; case GGML_OP_RMS_NORM: ggml_vk_rms_norm(ctx, compute_ctx, cgraph, node_idx, (float *)node->op_params); break; case GGML_OP_RMS_NORM_BACK: ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_L2_NORM: ggml_vk_l2_norm(ctx, compute_ctx, src0, node); break; case GGML_OP_UNARY: if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) { ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx); break; } switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_EXP: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_NEG: case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_SIGMOID: case GGML_UNARY_OP_HARDSIGMOID: case GGML_UNARY_OP_HARDSWISH: case GGML_UNARY_OP_ABS: case GGML_UNARY_OP_SOFTPLUS: case GGML_UNARY_OP_STEP: case GGML_UNARY_OP_ROUND: case GGML_UNARY_OP_CEIL: case GGML_UNARY_OP_FLOOR: case GGML_UNARY_OP_TRUNC: case GGML_UNARY_OP_SGN: ggml_vk_unary(ctx, compute_ctx, src0, node); break; case GGML_UNARY_OP_XIELU: ggml_vk_xielu(ctx, compute_ctx, src0, node); break; default: return false; } break; case GGML_OP_GLU: switch (ggml_get_glu_op(node)) { case GGML_GLU_OP_GEGLU: case GGML_GLU_OP_REGLU: case GGML_GLU_OP_SWIGLU: case GGML_GLU_OP_SWIGLU_OAI: case GGML_GLU_OP_GEGLU_ERF: case GGML_GLU_OP_GEGLU_QUICK: ggml_vk_glu(ctx, compute_ctx, src0, src1, node); break; default: return false; } break; case GGML_OP_DIAG_MASK_INF: ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node); break; case GGML_OP_SOFT_MAX: if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) { ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx); } else { ggml_vk_soft_max(ctx, compute_ctx, src0, src1, src2, node); } break; case GGML_OP_SOFT_MAX_BACK: ggml_vk_soft_max_back(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_ROPE: ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, false); break; case GGML_OP_ROPE_BACK: ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, true); break; case GGML_OP_ARGSORT: if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) { ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx); } else { ggml_vk_argsort(ctx, compute_ctx, src0, node); } break; case GGML_OP_TOP_K: ggml_vk_topk(ctx, compute_ctx, src0, node); break; case GGML_OP_SUM: ggml_vk_sum(ctx, compute_ctx, src0, node); break; case GGML_OP_SUM_ROWS: ggml_vk_sum_rows(ctx, compute_ctx, src0, node); break; case GGML_OP_CUMSUM: ggml_vk_cumsum(ctx, compute_ctx, src0, node); break; case GGML_OP_MEAN: ggml_vk_mean(ctx, compute_ctx, src0, node); break; case GGML_OP_ARGMAX: ggml_vk_argmax(ctx, compute_ctx, src0, node); break; case GGML_OP_COUNT_EQUAL: ggml_vk_count_equal(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_SOLVE_TRI: ggml_vk_solve_tri(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_IM2COL: ggml_vk_im2col(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_IM2COL_3D: ggml_vk_im2col_3d(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_TIMESTEP_EMBEDDING: ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node); break; case GGML_OP_CONV_TRANSPOSE_1D: ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_POOL_2D: ggml_vk_pool_2d(ctx, compute_ctx, src0, node); break; case GGML_OP_CONV_2D: case GGML_OP_CONV_TRANSPOSE_2D: ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_CONV_2D_DW: ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_LEAKY_RELU: ggml_vk_leaky_relu(ctx, compute_ctx, src0, node); break; case GGML_OP_MUL_MAT: ggml_vk_mul_mat(ctx, compute_ctx, cgraph, node_idx); break; case GGML_OP_MUL_MAT_ID: ggml_vk_mul_mat_id(ctx, compute_ctx, cgraph, node_idx); break; case GGML_OP_FLASH_ATTN_EXT: ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node->src[4], node); break; case GGML_OP_RWKV_WKV6: ggml_vk_rwkv_wkv6(ctx, compute_ctx, node); break; case GGML_OP_RWKV_WKV7: ggml_vk_rwkv_wkv7(ctx, compute_ctx, node); break; case GGML_OP_GATED_DELTA_NET: ggml_vk_gated_delta_net(ctx, compute_ctx, node); break; case GGML_OP_SSM_SCAN: ggml_vk_ssm_scan(ctx, compute_ctx, node); break; case GGML_OP_SSM_CONV: ggml_vk_ssm_conv(ctx, compute_ctx, node); break; case GGML_OP_OPT_STEP_ADAMW: ggml_vk_opt_step_adamw(ctx, compute_ctx, node); break; case GGML_OP_OPT_STEP_SGD: ggml_vk_opt_step_sgd(ctx, compute_ctx, src0, src1, src2, node); break; default: return false; } ctx->tensor_ctxs[node_idx] = compute_ctx; #if defined(GGML_VULKAN_CHECK_RESULTS) // Force context reset on each node so that each tensor ends up in its own context // and can be run and compared to its CPU equivalent separately last_node = true; #endif if (submit || last_node) { ggml_vk_ctx_end(compute_ctx); // TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward if (last_node) { compute_ctx->exit_tensor_idx = node_idx_begin; } else { compute_ctx->exit_tensor_idx = -1; } ctx->compute_ctx.reset(); ggml_vk_compute_forward(ctx, cgraph, node_begin, node_idx_begin, almost_ready); } return true; } static void ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, ggml_tensor * tensor, int tensor_idx, bool almost_ready = false) { GGML_UNUSED(cgraph); GGML_UNUSED(tensor); VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")"); vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock(); // Only run if ctx hasn't been submitted yet if (!subctx->seqs.empty()) { #ifdef GGML_VULKAN_CHECK_RESULTS ggml_vk_check_results_0(ctx, cgraph, tensor_idx); #endif // Do staging buffer copies for (auto& cpy : subctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } for (auto& mset : subctx->memsets) { memset(mset.dst, mset.val, mset.n); } if (almost_ready && !ctx->almost_ready_fence_pending) { ggml_vk_submit(subctx, ctx->almost_ready_fence); ctx->almost_ready_fence_pending = true; } else { ggml_vk_submit(subctx, {}); } ctx->submit_pending = true; #ifdef GGML_VULKAN_CHECK_RESULTS ggml_vk_synchronize(ctx); ggml_vk_check_results_1(ctx, cgraph, tensor_idx); #endif } if (tensor_idx == subctx->exit_tensor_idx) { // Do staging buffer copies for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } subctx->in_memcpys.clear(); subctx->out_memcpys.clear(); subctx->memsets.clear(); } } // Clean up after graph processing is done static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { VK_LOG_DEBUG("ggml_vk_graph_cleanup()"); ctx->prealloc_y_last_pipeline_used = {}; ctx->unsynced_nodes_written.clear(); ctx->unsynced_nodes_read.clear(); ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false; ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool); if (ctx->device->async_use_transfer_queue) { ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool); } for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) { ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s }); } ctx->gc.semaphores.clear(); for (size_t i = 0; i < ctx->gc.tl_semaphores.size(); i++) { ctx->device->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s }); } ctx->gc.tl_semaphores.clear(); ctx->semaphore_idx = 0; ctx->event_idx = 0; for (auto& event : ctx->gc.events) { ctx->device->device.resetEvent(event); } ctx->tensor_ctxs.clear(); ctx->gc.contexts.clear(); ctx->pipeline_descriptor_set_requirements = 0; ctx->descriptor_set_idx = 0; } // Clean up on backend free static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->name << ")"); // discard any unsubmitted command buffers ctx->compute_ctx.reset(); // wait for any pending command buffers to finish ggml_vk_synchronize(ctx); ggml_vk_graph_cleanup(ctx); ggml_vk_destroy_buffer(ctx->prealloc_x); ggml_vk_destroy_buffer(ctx->prealloc_y); ggml_vk_destroy_buffer(ctx->prealloc_split_k); ggml_vk_destroy_buffer(ctx->prealloc_add_rms_partials); ggml_vk_destroy_buffer(ctx->sync_staging); ctx->prealloc_y_last_pipeline_used = nullptr; ctx->prealloc_size_x = 0; ctx->prealloc_size_y = 0; ctx->prealloc_size_split_k = 0; for (auto& event : ctx->gc.events) { ctx->device->device.destroyEvent(event); } ctx->gc.events.clear(); ctx->device->device.destroyFence(ctx->fence); ctx->device->device.destroyFence(ctx->almost_ready_fence); for (auto& pool : ctx->descriptor_pools) { ctx->device->device.destroyDescriptorPool(pool); } ctx->descriptor_pools.clear(); ctx->descriptor_sets.clear(); ctx->compute_cmd_pool.destroy(ctx->device->device); if (ctx->device->async_use_transfer_queue) { ctx->device->device.destroySemaphore(ctx->transfer_semaphore.s); ctx->transfer_cmd_pool.destroy(ctx->device->device); } if (vk_perf_logger_enabled) { ctx->perf_logger->print_timings(true); } } static int ggml_vk_get_device_count() { ggml_vk_instance_init(); return vk_instance.device_indices.size(); } static void ggml_vk_get_device_description(int device, char * description, size_t description_size) { ggml_vk_instance_init(); std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); vk::PhysicalDeviceProperties props; devices[device].getProperties(&props); snprintf(description, description_size, "%s", props.deviceName.data()); } // backend interface #define UNUSED GGML_UNUSED // device backend static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name; } static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()"); ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_destroy_buffer(ctx->dev_buffer); delete ctx; } static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { return vk_ptr_base; UNUSED(buffer); } static enum ggml_status ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); if (tensor->view_src != nullptr) { GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); } return GGML_STATUS_SUCCESS; } static void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { VK_LOG_DEBUG("ggml_backend_vk_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; vk_buffer buf = buf_ctx->dev_buffer; if (size == 0) { return; } uint32_t val32 = (uint32_t)value * 0x01010101; ggml_vk_buffer_memset(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, val32, size); } static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; vk_buffer buf = buf_ctx->dev_buffer; if (size == 0) { return; } ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } static void ggml_backend_vk_buffer_set_tensor_2d(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) { VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor_2d(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ", " << n_copies << ", " << stride_tensor << ", " << stride_data << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; vk_buffer buf = buf_ctx->dev_buffer; if (size == 0) { return; } ggml_vk_buffer_write_2d(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, stride_data, stride_tensor, size, n_copies); } static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; if (size == 0) { return; } vk_buffer buf = buf_ctx->dev_buffer; ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); } static void ggml_backend_vk_buffer_get_tensor_2d(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) { VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor_2d(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ", " << n_copies << ", " << stride_tensor << ", " << stride_data << ")"); ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context; if (size == 0) { return; } vk_buffer buf = buf_ctx->dev_buffer; ggml_vk_buffer_read_2d(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, stride_tensor, stride_data, size, n_copies); } static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { if (ggml_nbytes(src) == 0) { return true; } if (ggml_backend_buffer_is_vk(src->buffer)) { ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context; ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; vk_buffer src_buf = src_buf_ctx->dev_buffer; vk_buffer dst_buf = dst_buf_ctx->dev_buffer; ggml_vk_buffer_copy(dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src)); return true; } return false; UNUSED(buffer); } static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); } static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { /* .free_buffer = */ ggml_backend_vk_buffer_free_buffer, /* .get_base = */ ggml_backend_vk_buffer_get_base, /* .init_tensor = */ ggml_backend_vk_buffer_init_tensor, /* .memset_tensor = */ ggml_backend_vk_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_vk_buffer_set_tensor, /* .get_tensor = */ ggml_backend_vk_buffer_get_tensor, /* .set_tensor_2d = */ ggml_backend_vk_buffer_set_tensor_2d, /* .get_tensor_2d = */ ggml_backend_vk_buffer_get_tensor_2d, /* .cpy_tensor = */ ggml_backend_vk_buffer_cpy_tensor, /* .clear = */ ggml_backend_vk_buffer_clear, /* .reset = */ NULL, }; // vk buffer type static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) { ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *)buft->context; return ctx->name.c_str(); } static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")"); ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; vk_buffer dev_buffer = nullptr; try { dev_buffer = ggml_vk_create_buffer_device(ctx->device, size); } catch (const vk::SystemError& e) { return nullptr; } ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->device, std::move(dev_buffer), ctx->name); return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size); } static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; return ctx->device->properties.limits.minStorageBufferOffsetAlignment; } static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; return ctx->device->suballocation_block_size; } static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { return ggml_nbytes(tensor); UNUSED(buft); } ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { ggml_vk_instance_init(); VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")"); vk_device dev = ggml_vk_get_device(dev_num); return &dev->buffer_type; } // host buffer type static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_type_t buft) { return GGML_VK_NAME "_Host"; UNUSED(buft); } static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) { return GGML_VK_NAME "_Host"; UNUSED(buffer); } static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); ggml_vk_host_free(vk_instance.devices[0], buffer->context); } static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")"); size += 32; // Behave like the CPU buffer type void * ptr = nullptr; try { ptr = ggml_vk_host_malloc(vk_instance.devices[0], size); } catch (vk::SystemError& e) { GGML_LOG_WARN("ggml_vulkan: Failed to allocate pinned memory (%s)\n", e.what()); // fallback to cpu buffer return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); } ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer; return buffer; UNUSED(buft); } static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return vk_instance.devices[0]->properties.limits.minMemoryMapAlignment; UNUSED(buft); } static size_t ggml_backend_vk_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { return vk_instance.devices[0]->suballocation_block_size; UNUSED(buft); } // Should be changed to return device-specific host buffer type // but that probably requires changes in llama.cpp ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_vk_buffer_type_host = { /* .iface = */ { /* .get_name = */ ggml_backend_vk_host_buffer_type_name, /* .alloc_buffer = */ ggml_backend_vk_host_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_vk_host_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), 0), /* .context = */ nullptr, }; // Make sure device 0 is initialized ggml_vk_instance_init(); ggml_vk_get_device(0); return &ggml_backend_vk_buffer_type_host; } // backend static const char * ggml_backend_vk_name(ggml_backend_t backend) { ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; return ctx->name.c_str(); } static void ggml_backend_vk_free(ggml_backend_t backend) { ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")"); ggml_vk_cleanup(ctx); delete ctx; delete backend; } static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; return &ctx->device->buffer_type; } static void ggml_backend_vk_set_tensor_2d_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) { VK_LOG_DEBUG("ggml_backend_vk_set_tensor_2d_async(" << size << ", " << n_copies << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); if (size == 0) { return; } ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; vk_context cpy_ctx; if (ctx->device->async_use_transfer_queue) { if (ctx->transfer_ctx.expired()) { cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool); ctx->transfer_ctx = cpy_ctx; ggml_vk_ctx_begin(ctx->device, cpy_ctx); } else { cpy_ctx = ctx->transfer_ctx.lock(); } } else { cpy_ctx = ggml_vk_get_compute_ctx(ctx); } vk_buffer buf = buf_ctx->dev_buffer; auto dst_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset; bool ret = ggml_vk_buffer_write_2d_async(cpy_ctx, buf, dst_offset, data, stride_data, stride_tensor, size, n_copies); if (!ret) { const size_t staging_size = size * n_copies; ggml_vk_ensure_sync_staging_buffer(ctx, staging_size); ggml_vk_sync_buffers(nullptr, cpy_ctx); std::vector slices(1); if (size == stride_tensor) { slices[0].srcOffset = 0; slices[0].dstOffset = dst_offset; slices[0].size = staging_size; } else { slices.resize(n_copies); for (size_t i = 0; i < n_copies; i++) { slices[i].srcOffset = i * size; slices[i].dstOffset = dst_offset + i * stride_tensor; slices[i].size = size; } } cpy_ctx->s->buffer->buf.copyBuffer(ctx->sync_staging->buffer, buf->buffer, slices); if (size == stride_data) { deferred_memcpy(ctx->sync_staging->ptr, data, staging_size, &cpy_ctx->in_memcpys); } else { for (size_t i = 0; i < n_copies; i++) { deferred_memcpy((uint8_t *)ctx->sync_staging->ptr + i * size, (const uint8_t *)data + i * stride_data, size, &cpy_ctx->in_memcpys); } } ggml_vk_synchronize(ctx); } } static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")"); ggml_backend_vk_set_tensor_2d_async(backend, tensor, data, offset, size, 1, size, size); } static void ggml_backend_vk_get_tensor_2d_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) { VK_LOG_DEBUG("ggml_backend_vk_get_tensor_2d_async(" << size << ", " << n_copies << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); if (size == 0) { return; } ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); vk_buffer buf = buf_ctx->dev_buffer; auto src_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset; bool ret = ggml_vk_buffer_read_2d_async(compute_ctx, buf, src_offset, data, stride_tensor, stride_data, size, n_copies); if (!ret) { const size_t staging_size = size * n_copies; ggml_vk_ensure_sync_staging_buffer(ctx, staging_size); ggml_vk_sync_buffers(nullptr, compute_ctx); std::vector