#include "arg.h"
#include "common.h"
#include "debug.h"
#include "log.h"
#include "llama.h"

#include <clocale>
#include <string>
#include <vector>

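// tokenize the prompt and evaluate it with a single llama_decode() call so that
// the eval callback registered in main() fires for every node in the compute graph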
static bool run(llama_context * ctx, const common_params & params) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

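    // check whether the vocab expects a BOS token to be prepended to the prompt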
    const bool add_bos = llama_vocab_get_add_bos(vocab);

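    // tokenize the prompt; the trailing 'true' enables parsing of special tokens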
    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos, true);

    if (tokens.empty()) {
        LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
        return false;
    }

    LOG_INF("number of input tokens = %zu\n", tokens.size());
    for (size_t i = 0; i < tokens.size(); ++i) {
        LOG_INF("  %d\n", tokens[i]);
    }

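    // evaluate all tokens in a single batch - this is the call that triggers the
    // eval callback for each node of the computation graph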
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size()))) {
        LOG_ERR("%s : failed to eval\n", __func__);
        return false;
    }

    return true;
}

int main(int argc, char ** argv) {
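    // use the C locale so that numbers are logged with '.' as the decimal separator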
    std::setlocale(LC_NUMERIC, "C");

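    // user data passed to the debug eval callback (see debug.h)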
    common_debug_cb_user_data cb_data;

    common_params params;

    common_init();

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    llama_backend_init();
    llama_numa_init(params.numa);

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = common_debug_cb_eval;
    params.cb_eval_user_data = &cb_data;
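    // disable the warmup run so the callback is not invoked on the dummy warmup batch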
    params.warmup = false;

    // init
    auto llama_init = common_init_from_params(params);

    auto * model = llama_init->model();
    auto * ctx   = llama_init->context();

    if (model == nullptr || ctx == nullptr) {
        LOG_ERR("%s : failed to init\n", __func__);
        return 1;
    }

    // print system information
    {
        LOG_INF("\n");
        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
        LOG_INF("\n");
    }

    bool OK = run(ctx, params);
    if (!OK) {
        return 1;
    }

    LOG("\n");
    llama_perf_context_print(ctx);

    llama_backend_free();

    return 0;
}
