#include "utils.h"

#include "ggml-impl.h"
#include "ggml-openvino-extra.h"
#include "ggml-openvino/ggml-decoder.h"
#include "ggml.h"
#include "openvino/frontend.h"
#include "openvino/input_model.h"

#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <memory>
#include <mutex>
#include <openvino/core/any.hpp>
#include <openvino/core/graph_util.hpp>
#include <openvino/core/shape.hpp>
#include <openvino/core/type/float16.hpp>
#include <openvino/frontend/manager.hpp>
#include <openvino/openvino.hpp>
#include <openvino/runtime/compiled_model.hpp>
#include <openvino/runtime/infer_request.hpp>
#include <openvino/runtime/intel_npu/properties.hpp>
#include <openvino/runtime/properties.hpp>
#include <openvino/runtime/tensor.hpp>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>

// Suppress deprecation warning for ov::Tensor::data()
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"

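// Backend entry point: optionally dumps the cgraph, then dispatches to the static (NPU)
// or dynamic compute path and maps any exception to GGML_STATUS_FAILED.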
enum ggml_status ov_graph_compute(ggml_cgraph * cgraph, ggml_backend_t backend) {
    ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context;
    try {
        if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
            std::string filename = "cgraph_ov.txt";
            GgmlOvDecoder::dump_cgraph(cgraph, filename);
        }

        const auto is_static = ggml_openvino_is_npu();

        GGML_ASSERT(ctx->runtime_context != nullptr);
        std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);

        return is_static ? ov_graph_compute_static(cgraph, r_ctx) : ov_graph_compute_dynamic(cgraph, r_ctx);
    } catch (const ov::Exception & e) {
        GGML_LOG_ERROR("GGML OpenVINO backend ov::Exception: %s\n", e.what());
        return GGML_STATUS_FAILED;
    } catch (const std::exception & e) {
        GGML_LOG_ERROR("GGML OpenVINO backend std::exception: %s\n", e.what());
        return GGML_STATUS_FAILED;
    } catch (...) {
        GGML_LOG_ERROR("GGML OpenVINO backend unknown exception\n");
        return GGML_STATUS_FAILED;
    }
}

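// Wrap the ggml output buffer in an ov::Tensor so inference writes results in place.
// Static (NPU) models take the shape from the compiled model's output; otherwise the
// shape is derived from the ggml tensor.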
ov::Tensor create_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
                                   std::shared_ptr<ov::InferRequest> infer_request,
                                   int output_index,
                                   const ggml_tensor * ggml_tensor) {
    auto output_type = ggml_decoder->get_ov_type(ggml_tensor);
    ov::Shape output_shape;
    if (ggml_decoder->is_static()) {
        output_shape = infer_request->get_output_tensor(output_index).get_shape();
    } else {
        output_shape = ggml_decoder->get_shape(ggml_tensor);
    }

    ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data);
    return output_tensor;
}

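// Dynamic-shape compute path (non-NPU devices): reuses the cached decoder and infer
// request when the graph and model params allow it, otherwise converts the cgraph to an
// ov::Model, compiles it, and refreshes the caches before running a single inference.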
enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx) {
    auto & core = ov_singleton_core();
    const auto & config = ggml_openvino_get_compile_config();
    const auto & device = r_ctx->device;
    const auto & stateful = r_ctx->stateful;
    static auto is_static = false;

    if (is_naive(cgraph)) {
        return naive_compute(cgraph, core, device, config);
    }

    auto start_time = ggml_time_us();

    std::shared_ptr<GgmlOvDecoder> ggml_decoder;
    std::shared_ptr<ov::InferRequest> infer_request;
    ModelParams m_params;
    ComputeParams c_params;
    std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);

    graph_key key(cgraph);
    bool cache_hit;

    int64_t decoder_end_time;
    int64_t conversion_end_time;
    int64_t compile_end_time;
    int64_t infer_end_time;

    {
        std::shared_ptr<decoder_runtime_ctx> entry;
        ModelParams old_m_params;

        {
            std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
            auto it = r_ctx->decoder_cache.find(key);
            cache_hit = it != r_ctx->decoder_cache.end();
            if (cache_hit) {
                entry = it->second;
            } else {
                auto mutex = std::make_shared<std::mutex>();
                entry = std::make_shared<decoder_runtime_ctx>(mutex);
                r_ctx->decoder_cache[key] = entry;
            }
        }

        std::lock_guard<std::mutex> lock(*(entry->mutex));

        if (cache_hit) {
            ggml_decoder = entry->ptr;
            old_m_params = ggml_decoder->get_model_params();
            cache_hit = old_m_params.can_reuse_dynamically(m_params);
        }

        if (cache_hit) {
            ggml_decoder->set_compute_params(c_params);
            ggml_decoder->set_model_params(m_params);
            if (old_m_params.kv_buffer_changed(m_params)) {
                ggml_decoder->update_io(cgraph);
            }
            ggml_decoder->add_extra_inputs();
            {
                std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
                infer_request = r_ctx->infer_request_cache.at(key);
            }

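            // Stateful models keep the KV cache in the infer request's variable states:
            // reset them when a new sequence starts (pos 0), extend the tracked size on a
            // contiguous continuation, and otherwise slice each state back to the requested
            // position before inferring.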
            if (stateful) {
                const auto * inp_pos = get_inp_pos_tensor(cgraph);
                int32_t * pos_data = (int32_t *) inp_pos->data;
                auto pos_shape = ggml_decoder->get_shape(inp_pos);
                if (pos_data[0] == 0) {
                    infer_request->reset_state();
                    r_ctx->stateful_kv_size = pos_shape[3];
                } else if (r_ctx->stateful_kv_size == static_cast<size_t>(pos_data[0])) {
                    r_ctx->stateful_kv_size += pos_shape[3];
                } else {
                    auto states = infer_request->query_state();
                    for (auto state : states) {
                        auto state_tensor = state.get_state();
                        auto state_tensor_shape = state_tensor.get_shape();
                        if (static_cast<uint32_t>(pos_data[0]) > r_ctx->stateful_kv_size) {
                            std::string state_name;
                            try {
                                state_name = r_ctx->kv_state_input_name_map.at(state.get_name());
                            } catch (...) {
                                GGML_LOG_ERROR("GGML OpenVINO backend stateful inference failed: no input found for the state\n");
                                return GGML_STATUS_FAILED;
                            }
                            auto kv_tensor = get_ov_input_tensor(ggml_decoder, state_name);
                            kv_tensor.set_shape({state_tensor_shape[0], kv_tensor.get_shape()[2],
                                                 state_tensor_shape[2], state_tensor_shape[3]});
                            state_tensor = kv_tensor;
                            state_tensor_shape = state_tensor.get_shape();
                        }
                        ov::Coordinate begin = {0, 0, 0, 0};
                        ov::Coordinate end = {state_tensor_shape[0], static_cast<uint32_t>(pos_data[0]),
                                              state_tensor_shape[2], state_tensor_shape[3]};
                        ov::Tensor new_state_tensor(state_tensor, begin, end);
                        state.set_state(new_state_tensor);
                    }
                    r_ctx->stateful_kv_size = pos_data[0] + 1;
                }
            }

            decoder_end_time = ggml_time_us();
            conversion_end_time = decoder_end_time;
            compile_end_time = decoder_end_time;
        } else {
            {
                std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
                r_ctx->infer_request_cache.erase(key);
            }

            std::shared_ptr<ov::Model> model;
            auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);

            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static, stateful);
            decoder_end_time = ggml_time_us();

            auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
            model = ov::frontend::ggml::FrontEnd::convert(input_model);
            ggml_decoder->clear_model_weights();
            conversion_end_time = ggml_time_us();

            if (getenv("GGML_OPENVINO_DUMP_IR")) {
                char timestamped_filename[64];
                auto timestamp = (long long) ggml_time_us();
                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
                ov::serialize(model, timestamped_filename);
            }

            ov::CompiledModel compiled_model;
            auto remote_context = ggml_openvino_get_remote_context();
            if (remote_context.has_value()) {
                compiled_model = core.compile_model(model, remote_context.value(), config);
            } else {
                compiled_model = core.compile_model(model, device, config);
            }
            compile_end_time = ggml_time_us();
            infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
            entry->ptr = ggml_decoder;

            std::vector<std::string> ov_input_names;
            std::vector<std::string> ov_output_names;
            for (const auto & ov_param : model->get_parameters()) {
                ov_input_names.push_back(ov_param->get_friendly_name());
            }
            for (const auto & ov_output : model->get_results()) {
                ov_output_names.push_back(ov_output->get_friendly_name());
            }

            {
                std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
                r_ctx->infer_request_cache[key] = infer_request;
                r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
                r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
            }

            if (stateful) {
                const auto * inp_pos = get_inp_pos_tensor(cgraph);
                auto pos_shape = ggml_decoder->get_shape(inp_pos);
                r_ctx->stateful_kv_size = pos_shape[3];
                const auto kv_param_res_names = ggml_decoder->get_kv_param_res_names();
                for (const auto & pair : kv_param_res_names) {
                    r_ctx->kv_state_input_name_map[pair.first + pair.second] = pair.first;
                }
            }
        }

        std::vector<std::string> ov_input_names;
        std::vector<std::string> ov_output_names;
        {
            std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
            ov_input_names = r_ctx->ov_input_names_cache[key];
            ov_output_names = r_ctx->ov_output_names_cache[key];
        }

        for (size_t i = 0; i < ov_input_names.size(); i++) {
            auto param_name = ov_input_names[i];
            auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
            infer_request->set_input_tensor(i, input_tensor);

            if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
                print_input_tensor_info(param_name, input_tensor);
            }
        }

        for (size_t i = 0; i < ov_output_names.size(); i++) {
            auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
            auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
            infer_request->set_output_tensor(i, output_tensor);
        }

        infer_request->infer();
        infer_end_time = ggml_time_us();

        if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
            for (size_t i = 0; i < ov_output_names.size(); i++) {
                const auto output_tensor = infer_request->get_output_tensor(i);
                print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
            }
        }

        if (getenv("GGML_OPENVINO_PROFILING")) {
            GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
            GGML_LOG_INFO("  - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000);
            if (!cache_hit) {
                GGML_LOG_INFO("  - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
                GGML_LOG_INFO("  - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
            }
            GGML_LOG_INFO("  - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
        }
    }

    return GGML_STATUS_SUCCESS;
}

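// Static-shape compute path for NPU: builds and compiles separate prefill and decode
// models with fixed shapes, caches an infer request for each, and feeds the prompt
// through the prefill model in chunks of GGML_OPENVINO_PREFILL_CHUNK_SIZE tokens.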
enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx) {
    auto & core = ov_singleton_core();

    auto get_prefill_chunk_size = [] {
        const char * chunk_size_str = getenv("GGML_OPENVINO_PREFILL_CHUNK_SIZE");
        if (chunk_size_str != nullptr) {
            int chunk_size = atoi(chunk_size_str);
            if (chunk_size > 0) {
                return chunk_size;
            }
        }
        return 256;
    };

    static std::string device = "NPU";
    static auto is_static = true;
    static auto stateful = false;
    static auto prefill_chunk_size = get_prefill_chunk_size();
    const auto & config = ggml_openvino_get_compile_config();

    if (is_naive(cgraph)) {
        return naive_compute(cgraph, core, device, config);
    }

    auto start_time = ggml_time_us();

    std::shared_ptr<GgmlOvDecoder> ggml_decoder;
    std::shared_ptr<ov::InferRequest> infer_request;
    ModelParams m_params;
    ComputeParams c_params;
    std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);

    const auto * inp_pos = get_inp_pos_tensor(cgraph);
    const auto is_prefill = get_is_prefill(inp_pos);
    graph_key key(cgraph);
    bool cache_hit;

    int64_t decoder_end_time;
    int64_t conversion_end_time;
    int64_t compile_end_time;
    int64_t infer_end_time;

    std::shared_ptr<decoder_runtime_ctx> entry;
    ModelParams old_m_params;

    {
        std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
        auto it = r_ctx->decoder_cache.find(key);
        cache_hit = it != r_ctx->decoder_cache.end();
        if (cache_hit) {
            entry = it->second;
        } else {
            auto mutex = std::make_shared<std::mutex>();
            entry = std::make_shared<decoder_runtime_ctx>(mutex);
            r_ctx->decoder_cache[key] = entry;
        }
    }

    std::lock_guard<std::mutex> lock(*(entry->mutex));

    if (cache_hit) {
        ggml_decoder = entry->ptr;
        old_m_params = ggml_decoder->get_model_params();
        cache_hit = old_m_params.can_reuse_statically(m_params);
    }

    if (cache_hit) {
        ggml_decoder->m_is_prefill = is_prefill;
        ggml_decoder->set_model_params(m_params);
        ggml_decoder->set_compute_params(c_params);
        if (old_m_params.kv_buffer_changed(m_params)) {
            ggml_decoder->update_io(cgraph);
        }
        ggml_decoder->add_extra_inputs();
        {
            std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
            infer_request =
                is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
        }

        decoder_end_time = ggml_time_us();
        conversion_end_time = decoder_end_time;
        compile_end_time = decoder_end_time;
    } else {
        {
            std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
            r_ctx->infer_request_cache.erase(key);
            r_ctx->infer_request_cache_prefill.erase(key);
        }

        std::shared_ptr<ov::Model> model;
        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);

        auto ggml_decoder_prefill = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights,
                                                                    is_static, stateful, true, prefill_chunk_size);
        auto ggml_decoder_decode = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static,
                                                                   stateful, false, prefill_chunk_size);
        decoder_end_time = ggml_time_us();

        auto input_model_prefill = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_prefill);
        auto input_model_decode = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_decode);

        auto model_prefill = ov::frontend::ggml::FrontEnd::convert(input_model_prefill);
        ggml_decoder_prefill->clear_model_weights();
        auto model_decode = ov::frontend::ggml::FrontEnd::convert(input_model_decode);
        ggml_decoder_decode->clear_model_weights();
        conversion_end_time = ggml_time_us();

        if (getenv("GGML_OPENVINO_DUMP_IR")) {
            char timestamped_filename[64];
            auto timestamp = (long long) ggml_time_us();
            snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp);
            ov::serialize(model_prefill, timestamped_filename);
            snprintf(timestamped_filename, sizeof(timestamped_filename), "model_decode_%lld.xml", timestamp);
            ov::serialize(model_decode, timestamped_filename);
        }

        ov::CompiledModel compiled_model_prefill;
        ov::CompiledModel compiled_model_decode;
        auto remote_context = ggml_openvino_get_remote_context();
        if (remote_context.has_value()) {
            compiled_model_prefill = core.compile_model(model_prefill, remote_context.value(), config);
            compiled_model_decode = core.compile_model(model_decode, remote_context.value(), config);
        } else {
            compiled_model_prefill = core.compile_model(model_prefill, device, config);
            compiled_model_decode = core.compile_model(model_decode, device, config);
        }

        auto infer_request_prefill = std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
        auto infer_request_decode = std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
        compile_end_time = ggml_time_us();

        model = is_prefill ? model_prefill : model_decode;
        ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode;
        infer_request = is_prefill ? infer_request_prefill : infer_request_decode;
        entry->ptr = ggml_decoder;

        std::vector<std::string> ov_input_names;
        std::vector<std::string> ov_output_names;
        for (const auto & ov_param : model->get_parameters()) {
            ov_input_names.push_back(ov_param->get_friendly_name());
        }
        for (const auto & ov_output : model->get_results()) {
            ov_output_names.push_back(ov_output->get_friendly_name());
        }

        {
            std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
            r_ctx->infer_request_cache_prefill[key] = infer_request_prefill;
            r_ctx->infer_request_cache[key] = infer_request_decode;
            r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
            r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
        }
    }

    std::vector<std::string> ov_input_names_local;
    std::vector<std::string> ov_output_names_local;
    {
        std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
        ov_input_names_local = r_ctx->ov_input_names_cache[key];
        ov_output_names_local = r_ctx->ov_output_names_cache[key];
    }

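    // Prefill processes the prompt one fixed-size chunk per inference; decode handles a
    // single token per inference.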
    if (is_prefill) {
        auto inp_len = inp_pos->ne[0];
        for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) {
            for (size_t i = 0; i < ov_input_names_local.size(); i++) {
                auto param_name = ov_input_names_local[i];
                auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index);
                infer_request->set_input_tensor(i, input_tensor);

                if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
                    const auto input_tensor = infer_request->get_input_tensor(i);
                    print_input_tensor_info(param_name, input_tensor);
                }
            }

            for (size_t i = 0; i < ov_output_names_local.size(); i++) {
                auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
                auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
                infer_request->set_output_tensor(i, output_tensor);
            }

            infer_request->infer();

            if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
                for (size_t i = 0; i < ov_output_names_local.size(); i++) {
                    const auto output_tensor = infer_request->get_output_tensor(i);
                    print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
                }
            }
        }
        infer_end_time = ggml_time_us();
    } else {
        for (size_t i = 0; i < ov_input_names_local.size(); i++) {
            auto param_name = ov_input_names_local[i];
            auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name);
            infer_request->set_input_tensor(i, input_tensor);

            if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
                const auto input_tensor = infer_request->get_input_tensor(i);
                print_input_tensor_info(param_name, input_tensor);
            }
        }

        for (size_t i = 0; i < ov_output_names_local.size(); i++) {
            auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
            auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
            infer_request->set_output_tensor(i, output_tensor);
        }

        infer_request->infer();
        infer_end_time = ggml_time_us();

        if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
            for (size_t i = 0; i < ov_output_names_local.size(); i++) {
                const auto output_tensor = infer_request->get_output_tensor(i);
                print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
            }
        }
    }

    if (getenv("GGML_OPENVINO_PROFILING")) {
        GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
        GGML_LOG_INFO("  - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000);
        if (!cache_hit) {
            GGML_LOG_INFO("  - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
            GGML_LOG_INFO("  - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
        }
        GGML_LOG_INFO("  - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
    }

    return GGML_STATUS_SUCCESS;
}

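// A cgraph with fewer than 20 non-NONE ops is treated as "naive" and handled by
// naive_compute instead of the cached LLM compute paths.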
bool is_naive(ggml_cgraph * cgraph) {
    constexpr int naive_graph_size_threshold = 20;
    int count = 0;
    for (int i = 0; i < cgraph->n_nodes; i++) {
        if (cgraph->nodes[i]->op != GGML_OP_NONE) {
            count++;
        }
    }
    return count < naive_graph_size_threshold;
}

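// Compile and run a small cgraph directly. Graphs consisting of a single NONE/VIEW node
// are no-ops; everything else is converted, compiled and inferred without caching.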
enum ggml_status naive_compute(ggml_cgraph * cgraph,
                               ov::Core & core,
                               const std::string & device,
                               const ov::AnyMap & config) {
    if (cgraph->n_nodes == 1 && (cgraph->nodes[0]->op == GGML_OP_NONE || cgraph->nodes[0]->op == GGML_OP_VIEW)) {
        return GGML_STATUS_SUCCESS;
    }

    bool naive = true;
    auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, naive);
    auto decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights);
    auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(decoder);
    auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive);
    if (getenv("GGML_OPENVINO_DUMP_IR")) {
        ov::serialize(model, "IR_naive.xml");
    }

    std::shared_ptr<ov::InferRequest> infer_request;
    auto remote_context = ggml_openvino_get_remote_context();
    if (cgraph->nodes[0]->op == GGML_OP_MUL_MAT) {
        // TODO: The ACCURACY hint triggers a bug in the GPU plugin/driver on Lunar Lake; remove this workaround once CVS-182166 is resolved
        core.set_property(device, ov::hint::execution_mode(ov::hint::ExecutionMode::PERFORMANCE));
    } else {
        core.set_property(device, ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY));
    }
    if (remote_context.has_value()) {
        infer_request = std::make_shared<ov::InferRequest>(
            core.compile_model(model, remote_context.value(), config).create_infer_request());
    } else {
        infer_request =
            std::make_shared<ov::InferRequest>(core.compile_model(model, device, config).create_infer_request());
    }

    auto ov_params = model->get_parameters();
    for (size_t i = 0; i < ov_params.size(); i++) {
        auto param_name = ov_params[i]->get_friendly_name();
        auto input_tensor = get_ov_input_tensor(decoder, param_name);
        infer_request->set_input_tensor(i, input_tensor);
    }

    auto ov_results = model->get_results();
    for (size_t i = 0; i < ov_results.size(); i++) {
        auto * ggml_tensor = decoder->get_model_outputs().at(ov_results[i]->get_friendly_name());
        auto output_tensor = create_ov_output_tensor(decoder, infer_request, i, ggml_tensor);
        infer_request->set_output_tensor(i, output_tensor);
    }

    infer_request->infer();
    return GGML_STATUS_SUCCESS;
}

namespace {
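// Wrap a graph input's ggml buffer in an ov::Tensor, reusing the tensor stored in
// ggml_tensor->extra when the backend has already created one.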
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name) {
    const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);

    if (ggml_tensor->extra != nullptr) {
        // GGML_LOG_DEBUG("Using ggml_tensor->extra as ov::Tensor for input: %s\n", name.c_str());
        auto * extra_base = static_cast<ggml_openvino_extra_base *>(ggml_tensor->extra);
        if (extra_base->type != ggml_openvino_extra_base::Type::TENSOR) {
            throw std::runtime_error("ggml tensor extra is not of type TENSOR for input: " + name);
        }
        auto * tensor_extra = static_cast<ggml_openvino_tensor_extra *>(extra_base);
        return *tensor_extra->tensor;
    }

    // GGML_LOG_DEBUG("Converting ggml tensor to ov::Tensor for input: %s\n", name.c_str());
    auto * input_data = ggml_tensor->data;
    ov::Shape input_shape;
    if (ggml_tensor->op == GGML_OP_VIEW) {
        // This case is added to make test-backend-ops work
        input_shape = ggml_decoder->get_shape(ggml_tensor->view_src);
    } else {
        input_shape = ggml_decoder->get_shape(ggml_tensor);
    }
    auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data);
    return input_tensor;
}
}  // namespace

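// Resolve a model parameter to an ov::Tensor: extra inputs synthesized by the decoder
// come from its stored values, all other inputs are wrapped from the ggml buffers.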
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name) {
    ov::Tensor input_tensor;
    if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
        input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
    } else {
        input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
    }
    return input_tensor;
}

ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
                                             const std::string & param_name) {
    // NPU decoding stage
    const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
    const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);

    if (GgmlOvDecoder::is_inp_tok(ggml_tensor, op) || GgmlOvDecoder::is_inp_pos(ggml_tensor, op) ||
        GgmlOvDecoder::is_kv_idx(ggml_tensor, op)) {
        assert(ggml_tensor->ne[0] == 1);
        ov::Shape input_shape = {1, 1, 1, 1};
        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
        if (ggml_tensor->type == GGML_TYPE_I32) {
            *input_tensor.data<int32_t>() = *((int32_t *) ggml_tensor->data);
        } else if (ggml_tensor->type == GGML_TYPE_I64) {
            *input_tensor.data<int64_t>() = *((int64_t *) ggml_tensor->data);
        } else {
            throw std::runtime_error("Unexpected tensor type for " + param_name);
        }
        return input_tensor;
    }

    if (GgmlOvDecoder::is_output_idx(ggml_tensor, op)) {
        ov::Shape input_shape = {1, 1, 1, 1};
        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
        int32_t inp_out_id = *((int32_t *) ggml_tensor->data);
        assert(ggml_tensor->ne[0] == 1);
        assert(inp_out_id == 0);
        *input_tensor.data<int32_t>() = inp_out_id;
        return input_tensor;
    }

    if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) {
        size_t context_size = ggml_decoder->get_ctx_size();
        std::vector<float> padded_data = pad_input<float>(ggml_tensor, 1, context_size, -INFINITY);
        ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size});
        auto * data_ptr = input_tensor.data<float>();
        std::copy(padded_data.begin(), padded_data.begin() + context_size, data_ptr);
        return input_tensor;
    }

    return get_ov_input_tensor(ggml_decoder, param_name);
}

ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
                                              const std::string & param_name,
                                              int chunk_index) {
    // NPU prompt processing stage
    const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
    const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);

    const size_t input_len = ggml_decoder->get_input_len();
    const size_t chunk_size = ggml_decoder->m_prefill_chunk_size;
    const size_t chunk_valid_size = std::min(chunk_size, input_len - chunk_index * chunk_size);
    const size_t chunk_pad_size = chunk_size - chunk_valid_size;

    if (GgmlOvDecoder::is_inp_tok(ggml_tensor, op) || GgmlOvDecoder::is_inp_pos(ggml_tensor, op) ||
        GgmlOvDecoder::is_kv_idx(ggml_tensor, op)) {
        ov::Shape input_shape = {1, 1, 1, chunk_size};
        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
        // copy the chunk_index-th chunk from ggml_tensor
        size_t element_size = ggml_type_size(ggml_tensor->type);
        void * input_data = (char *) ggml_tensor->data + chunk_index * chunk_size * element_size;
        std::memcpy(input_tensor.data(), input_data, chunk_valid_size * element_size);
        // pad the rest with last_value + 1, so that the KV entries of the padded positions are
        // written to the rows following the valid rows in the KV cache
        if (chunk_pad_size > 0) {
            if (ggml_tensor->type == GGML_TYPE_I32) {
                int32_t last_value =
                    *((int32_t *) ggml_tensor->data + (chunk_index * chunk_size + chunk_valid_size - 1));
                int32_t * output_data = input_tensor.data<int32_t>();
                std::fill(output_data + chunk_valid_size, output_data + chunk_size, last_value + 1);
            } else if (ggml_tensor->type == GGML_TYPE_I64) {
                int64_t last_value =
                    *((int64_t *) ggml_tensor->data + (chunk_index * chunk_size + chunk_valid_size - 1));
                int64_t * output_data = input_tensor.data<int64_t>();
                std::fill(output_data + chunk_valid_size, output_data + chunk_size, last_value + 1);
            } else {
                throw std::runtime_error("Unexpected tensor type for " + param_name);
            }
        }
        return input_tensor;
    }

    if (GgmlOvDecoder::is_output_idx(ggml_tensor, op)) {
        size_t output_len = ggml_decoder->get_compute_params().output_len;
        ov::Shape input_shape = {1, 1, 1, output_len};
        ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
        if (ggml_tensor->ne[0] == 0) {
            *input_tensor.data<int32_t>() = 0;
        } else {
            auto * data_addr = input_tensor.data<int32_t>();
            for (size_t i = 0; i < output_len; i++) {
                data_addr[i] = ((int32_t *) ggml_tensor->data)[i] % chunk_size;
            }
        }
        return input_tensor;
    }

    if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) {
        size_t cols = ggml_tensor->ne[0];
        size_t rows = ggml_tensor->ne[1];
        float * ggml_data = (float *) ggml_tensor->data + chunk_index * chunk_size * cols;
        size_t chunk_valid_rows = std::min(chunk_size, rows - chunk_index * chunk_size);
        size_t context_size = ggml_decoder->get_ctx_size();
        std::vector<float> padded_data =
            pad_input<float>(ggml_data, chunk_valid_rows, cols, chunk_size, context_size, -INFINITY);
        set_zero_diagonal(padded_data, chunk_size, context_size);
        ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, chunk_size, context_size});
        auto * data_ptr = input_tensor.data<float>();
        std::copy(padded_data.begin(), padded_data.begin() + chunk_size * context_size, data_ptr);
        return input_tensor;
    }

    return get_ov_input_tensor(ggml_decoder, param_name);
}

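// Order-sensitive byte checksum (mixes the byte index into the sum), handy for
// debugging and comparing buffer contents.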
size_t checksum(const void * data, size_t size) {
    const uint8_t * bytes = static_cast<const uint8_t *>(data);
    size_t sum = 0;
    for (size_t i = 0; i < size; ++i) {
        sum += (uint8_t) i;
        sum += bytes[i];
    }
    return sum;
}

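// Debug helper (GGML_OPENVINO_DEBUG_INPUT): prints an input tensor's name, shape and
// address, the full matrix for the KQ mask, and the first value or full contents for
// other supported types.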
void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor) {
    std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data()
              << std::endl;
    switch (tensor.get_element_type()) {
    case ov::element::f32: {
        if (name.find("self_kq_mask") == std::string::npos) {
            std::cout << *(tensor.data<float>()) << std::endl;
        } else {
            size_t rows = tensor.get_shape()[2];
            size_t cols = tensor.get_shape()[3];
            auto * data = tensor.data<float>();
            for (size_t i = 0; i < rows; ++i) {
                for (size_t j = 0; j < cols; ++j) {
                    float val = data[i * cols + j];
                    if (std::isinf(val) && val < 0) {
                        std::cout << std::setw(5) << "-inf";
                    } else {
                        std::cout << std::setw(5) << val;
                    }
                }
                std::cout << std::endl;
            }
        }

        break;
    }
    case ov::element::f16:
        std::cout << *(tensor.data<ov::float16>()) << std::endl;
        break;
    case ov::element::i32:
        for (size_t i = 0; i < tensor.get_size(); ++i) {
            std::cout << tensor.data<int32_t>()[i] << " ";
        }
        std::cout << std::endl;
        break;
    case ov::element::i64:
        for (size_t i = 0; i < tensor.get_size(); ++i) {
            std::cout << tensor.data<int64_t>()[i] << " ";
        }
        std::cout << std::endl;
        break;
    default:
        break;
    }
}

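// Debug helper (GGML_OPENVINO_DEBUG_OUTPUT): prints an output tensor's name, shape and
// destination address, plus first/min/max/mean statistics for f32 and f16 outputs.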
void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, const void * output_dst) {
    std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() << ", Address: " << output_dst
              << std::endl;

    auto print_float_stats = [](const std::string & type_name, size_t size, auto get_value) {
        if (size == 0) {
            return;
        }

        float first = get_value(0);
        float min = first;
        float max = first;
        double sum = first;

        for (size_t i = 1; i < size; ++i) {
            float v = get_value(i);
            if (v < min) {
                min = v;
            }
            if (v > max) {
                max = v;
            }
            sum += v;
        }
        double mean = sum / size;

        std::cout << std::right << std::setw(6) << type_name << std::right << std::setw(12) << "First" << std::setw(12)
                  << "Min" << std::setw(12) << "Max" << std::setw(12) << "Mean" << std::endl;
        std::cout << std::right << std::setw(6) << "" << std::right << std::setw(12) << first << std::setw(12) << min
                  << std::setw(12) << max << std::setw(12) << mean << std::endl;
    };

    switch (tensor.get_element_type()) {
    case ov::element::f32: {
        const float * data = tensor.data<float>();
        size_t size = tensor.get_size();
        print_float_stats("[f32]", size, [data](size_t i) { return data[i]; });
        break;
    }
    case ov::element::f16: {
        const ov::float16 * data = tensor.data<ov::float16>();
        size_t size = tensor.get_size();
        print_float_stats("[f16]", size, [data](size_t i) { return static_cast<float>(data[i]); });
        break;
    }
    default:
        break;
    }
}

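// Zero the diagonal of a row-major rows x cols matrix (clamped to the last column);
// applied to the padded prefill mask so no row consists solely of -INFINITY.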
void set_zero_diagonal(std::vector<float> & matrix, size_t rows, size_t cols) {
    for (size_t i = 0; i < rows; ++i) {
        size_t diag_col = std::min(i, cols - 1);
        matrix[i * cols + diag_col] = 0.0f;
    }
}

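// Locate the inp_pos (token positions) tensor among the cgraph nodes' sources; throws
// if the graph does not contain one.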
const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) {
    for (int i = 0; i < cgraph->n_nodes; ++i) {
        auto * op = cgraph->nodes[i];
        for (int j = 0; j < GGML_MAX_SRC; ++j) {
            auto * src = op->src[j];
            if (src == nullptr) {
                break;
            }
            if (GgmlOvDecoder::is_inp_pos(src, op)) {
                return src;
            }
        }
    }
    GGML_LOG_ERROR("get_inp_pos_tensor: inp_pos not found in cgraph");
    throw std::runtime_error("get_inp_pos_tensor: inp_pos not found in cgraph");
}

bool get_is_prefill(const ggml_tensor * inp_pos) {
    return inp_pos->ne[0] > 1;
}

#pragma GCC diagnostic pop
