[update] update llm & vlm

LittleMouse · LittleMouse · commit 2de874ce577a · 2025-09-03T19:05:07.000+08:00
diff --git a/projects/llm_framework/main_llm/src/main.cpp b/projects/llm_framework/main_llm/src/main.cpp
@@ -29,7 +29,7 @@ using namespace StackFlows;
 int main_exit_flage = 0;
 static void __sigint(int iSigNo)
 {
-    SLOGW("llm_sys will be exit!");
+    SLOGW("llm_llm will be exit!");  // Logs the pending shutdown before main_exit_flage is raised; NOTE(review): presumably a SIGINT handler - confirm SLOGW is safe to call from one.
     main_exit_flage = 1;
 }
 
@@ -130,7 +130,7 @@ class llm_task {
             std::string base_model = base_model_path_ + model_ + "/";
             SLOGI("base_model %s", base_model.c_str());
 
-            CONFIG_AUTO_SET(file_body["mode_param"], system_prompt);   
+            CONFIG_AUTO_SET(file_body["mode_param"], system_prompt);
             CONFIG_AUTO_SET(file_body["mode_param"], tokenizer_type);
             CONFIG_AUTO_SET(file_body["mode_param"], filename_tokenizer_model);
             CONFIG_AUTO_SET(file_body["mode_param"], url_tokenizer_model);
@@ -325,6 +325,14 @@ class llm_task {
             }
 
             if (lLaMa_ctx_) {
+                if (msg == "reset") {
+                    lLaMa_ctx_->SetSystemPrompt(mode_config_.system_prompt, _token_ids);
+                    lLaMa_ctx_->GenerateKVCachePrefill(_token_ids, k_caches, v_caches, precompute_len);
+                    last_reply.clear();
+                    if (out_callback_) out_callback_("Context has been reset.", true);
+                    return;
+                }
+
                 lLaMa_ctx_->Encode(prompt_data, prompt_complete(msg), last_reply, tokens_ids, tokens_diff);
                 if (auto ret = lLaMa_ctx_->SetKVCache(k_caches, v_caches, precompute_len, tokens_diff.size());
                     ret != 0) {
diff --git a/projects/llm_framework/main_vlm/src/main.cpp b/projects/llm_framework/main_vlm/src/main.cpp
@@ -58,6 +58,8 @@ class llm_task {
     std::vector<std::string> inputs_;
     std::vector<unsigned short> prompt_data_;
     std::vector<unsigned char> image_data_;
+    std::vector<std::vector<unsigned char>> images_data;
+    std::vector<cv::Mat> mats;
     std::vector<unsigned short> img_embed;
     std::string prompt_;
     std::string last_reply;
@@ -134,8 +136,8 @@ class llm_task {
             CONFIG_AUTO_SET(file_body["mode_param"], url_tokenizer_model);
             CONFIG_AUTO_SET(file_body["mode_param"], filename_tokens_embed);
             CONFIG_AUTO_SET(file_body["mode_param"], filename_post_axmodel);
-            CONFIG_AUTO_SET(file_body["mode_param"], filename_vpm_resampler_axmodedl);
-            CONFIG_AUTO_SET(file_body["mode_param"], filename_image_encoder_axmodedl);
+            CONFIG_AUTO_SET(file_body["mode_param"], filename_vpm_resampler_axmodel);
+            CONFIG_AUTO_SET(file_body["mode_param"], filename_image_encoder_axmodel);
             CONFIG_AUTO_SET(file_body["mode_param"], template_filename_axmodel);
             CONFIG_AUTO_SET(file_body["mode_param"], b_use_topk);
             CONFIG_AUTO_SET(file_body["mode_param"], b_vpm_two_stage);
@@ -215,11 +217,11 @@ class llm_task {
                     SLOGE("filename_tokenizer_model: %s", mode_config_.filename_tokenizer_model.c_str());
                 }
             }
-            mode_config_.filename_tokens_embed           = base_model + mode_config_.filename_tokens_embed;
-            mode_config_.filename_post_axmodel           = base_model + mode_config_.filename_post_axmodel;
-            mode_config_.template_filename_axmodel       = base_model + mode_config_.template_filename_axmodel;
-            mode_config_.filename_vpm_resampler_axmodedl = base_model + mode_config_.filename_vpm_resampler_axmodedl;
-            mode_config_.filename_image_encoder_axmodedl = base_model + mode_config_.filename_image_encoder_axmodedl;
+            mode_config_.filename_tokens_embed          = base_model + mode_config_.filename_tokens_embed;
+            mode_config_.filename_post_axmodel          = base_model + mode_config_.filename_post_axmodel;
+            mode_config_.template_filename_axmodel      = base_model + mode_config_.template_filename_axmodel;
+            mode_config_.filename_vpm_resampler_axmodel = base_model + mode_config_.filename_vpm_resampler_axmodel;
+            mode_config_.filename_image_encoder_axmodel = base_model + mode_config_.filename_image_encoder_axmodel;
             mode_config_.runing_callback = [this](int *p_token, int n_token, const char *p_str, float token_per_sec,
                                                   void *reserve) {
                 if (this->out_callback_) {
@@ -342,17 +344,38 @@ class llm_task {
             }
 
             if (lLaMa_ctx_) {
+                if (msg == "reset") {
+                    lLaMa_ctx_->SetSystemPrompt(mode_config_.system_prompt, _token_ids);
+                    lLaMa_ctx_->GenerateKVCachePrefill(_token_ids, k_caches, v_caches, precompute_len);
+                    last_reply.clear();
+                    mats.clear();
+                    if (out_callback_) out_callback_("Context has been reset.", true);
+                    return;
+                }
+
                 if (image_data_.empty()) {
                     lLaMa_ctx_->Encode(prompt_data_, prompt_complete(msg), last_reply, tokens_ids, tokens_diff);
                     if (auto ret = lLaMa_ctx_->SetKVCache(k_caches, v_caches, precompute_len, tokens_diff.size());
                         ret != 0) {
-                        ALOGE("SetKVCache failed: %d,the context may be full,input \"reset\" to reset context", ret);
-                        return;
+                        ALOGW("The context full,Reset context");
+                        lLaMa_ctx_->SetSystemPrompt(mode_config_.system_prompt, _token_ids);
+                        lLaMa_ctx_->GenerateKVCachePrefill(_token_ids, k_caches, v_caches, precompute_len);
+                        lLaMa_ctx_->SetKVCache(k_caches, v_caches, precompute_len, tokens_diff.size());
                     }
                     last_reply = lLaMa_ctx_->Run(prompt_data_);
                     lLaMa_ctx_->GetKVCache(k_caches, v_caches, precompute_len);
                     if (out_callback_) out_callback_(last_reply, true);
                 } else {
+                    for (const auto &img_buf : images_data) {
+                        cv::Mat src = cv::imdecode(img_buf, cv::IMREAD_COLOR);
+                        if (src.empty()) {
+                            std::cerr << "Decode failed!" << std::endl;
+                            continue;
+                        }
+                        mats.push_back(src);
+                    }
+                    if (mats.empty()) return;
+                    images_data.clear();
                     cv::Mat src = cv::imdecode(image_data_, cv::IMREAD_COLOR);
                     if (src.empty()) return;
                     image_data_.clear();
@@ -361,6 +384,7 @@ class llm_task {
                         ALOGE("lLaMaCtx.Encode failed");
                         return;
                     }
+                    mats.clear();
                     if (auto ret =
                             lLaMa_ctx_->Encode(img_embed, prompt_data_, prompt_complete(msg), tokens_ids, tokens_diff);
                         ret != 0) {
@@ -369,8 +393,11 @@ class llm_task {
                     }
                     if (auto ret = lLaMa_ctx_->SetKVCache(k_caches, v_caches, precompute_len, tokens_diff.size());
                         ret != 0) {
-                        ALOGE("SetKVCache failed: %d,the context may be full,input \"reset\" to reset context", ret);
-                        return;
+                        ALOGW("The context full,Reset context");
+                        lLaMa_ctx_->SetSystemPrompt(mode_config_.system_prompt, _token_ids);
+                        lLaMa_ctx_->GenerateKVCachePrefill(_token_ids, k_caches, v_caches, precompute_len);
+                        lLaMa_ctx_->SetKVCache(k_caches, v_caches, precompute_len, tokens_diff.size());
+                        lLaMa_ctx_->ClearImgsEmbed();
                     }
                     last_reply = lLaMa_ctx_->Run(prompt_data_);
                     lLaMa_ctx_->GetKVCache(k_caches, v_caches, precompute_len);
@@ -549,6 +576,7 @@ class llm_vlm : public StackFlow {
         }
         if (object.find("jpeg") != std::string::npos) {
             llm_task_obj->image_data_.assign(next_data->begin(), next_data->end());
+            llm_task_obj->images_data.emplace_back(next_data->begin(), next_data->end());
             return;
         }
         llm_task_obj->inference((*next_data));
diff --git a/projects/llm_framework/main_vlm/src/runner/LLM.hpp b/projects/llm_framework/main_vlm/src/runner/LLM.hpp
@@ -23,10 +23,10 @@ struct LLMAttrType {
     std::string template_filename_axmodel = "tinyllama-int8/tinyllama_l%d.axmodel";
     int axmodel_num                       = 22;
 
-    std::string filename_post_axmodel           = "tinyllama-int8/tinyllama_post.axmodel";
-    std::string filename_image_encoder_axmodedl = "minicpmv/vpm_resampler_version0_fp16.axmodel";
-    std::string filename_vpm_encoder_axmodedl   = "minicpmv/vpm_resampler_version0_fp16.axmodel";
-    std::string filename_vpm_resampler_axmodedl = "minicpmv/vpm_resampler_version0_fp16.axmodel";
+    std::string filename_post_axmodel          = "tinyllama-int8/tinyllama_post.axmodel";
+    std::string filename_image_encoder_axmodel = "minicpmv/vpm_resampler_version0_fp16.axmodel";
+    std::string filename_vpm_encoder_axmodel   = "minicpmv/vpm_resampler_version0_fp16.axmodel";
+    std::string filename_vpm_resampler_axmodel = "minicpmv/vpm_resampler_version0_fp16.axmodel";
 
     int image_encoder_width  = 448;
     int image_encoder_height = 448;
@@ -184,24 +184,24 @@ class LLM {
         update_cqdm(&cqdm, attr.axmodel_num + 2, "count", axmodel_path);
 
         if (_attr.b_vpm_two_stage) {
-            ret = vpm_encoder.init(attr.filename_vpm_encoder_axmodedl.c_str(), false);
+            ret = vpm_encoder.init(attr.filename_vpm_encoder_axmodel.c_str(), false);
             if (ret != 0) {
-                ALOGE("init vpm axmodel(%s) failed", attr.filename_vpm_encoder_axmodedl.c_str());
+                ALOGE("init vpm axmodel(%s) failed", attr.filename_vpm_encoder_axmodel.c_str());
                 return false;
             }
 
-            ret = vpm_resampler.init(attr.filename_vpm_resampler_axmodedl.c_str(), false);
+            ret = vpm_resampler.init(attr.filename_vpm_resampler_axmodel.c_str(), false);
             if (ret != 0) {
-                ALOGE("init vpm axmodel(%s) failed", attr.filename_vpm_resampler_axmodedl.c_str());
+                ALOGE("init vpm axmodel(%s) failed", attr.filename_vpm_resampler_axmodel.c_str());
                 return false;
             }
 
             _attr.vpm_height = vpm_encoder.get_input(0).vShape[1];
             _attr.vpm_width  = vpm_encoder.get_input(0).vShape[2];
         } else {
-            ret = vpm_resampler.init(attr.filename_vpm_resampler_axmodedl.c_str(), false);
+            ret = vpm_resampler.init(attr.filename_vpm_resampler_axmodel.c_str(), false);
             if (ret != 0) {
-                ALOGE("init vpm axmodel(%s) failed", attr.filename_vpm_resampler_axmodedl.c_str());
+                ALOGE("init vpm axmodel(%s) failed", attr.filename_vpm_resampler_axmodel.c_str());
                 return false;
             }
             _attr.vpm_height = vpm_resampler.get_input(0).vShape[1];
@@ -637,6 +637,7 @@ class LLM_CTX {
 private:
     std::shared_ptr<BaseTokenizer> tokenizer;
     LLaMaEmbedSelector embed_selector;
+    std::vector<std::vector<unsigned short>> imgs_embed_;
 
     LLMAttrType _attr;
 
@@ -718,9 +719,9 @@ class LLM_CTX {
         sprintf(axmodel_path, "init post axmodel ok,remain_cmm(%d MB)", remain_cmm);
         update_cqdm(&cqdm, attr.axmodel_num + 2, "count", axmodel_path);
 
-        ret = image_encoder.init(attr.filename_image_encoder_axmodedl.c_str());
+        ret = image_encoder.init(attr.filename_image_encoder_axmodel.c_str());
         if (ret != 0) {
-            ALOGE("init vpm axmodel(%s) failed", attr.filename_image_encoder_axmodedl.c_str());
+            ALOGE("init vpm axmodel(%s) failed", attr.filename_image_encoder_axmodel.c_str());
             return false;
         }
 
@@ -1424,8 +1425,9 @@ class LLM_CTX {
     int Encode(std::vector<unsigned short> &img_embed, std::vector<unsigned short> &out_embed, std::string prompt,
                std::vector<int> &tokens_ids, std::vector<int> &tokens_diff)
     {
-        std::vector<std::vector<unsigned short>> imgs_embed = {img_embed};
-        return Encode(imgs_embed, out_embed, prompt, tokens_ids, tokens_diff);
+        // Single-image overload: keep the embedding in the member list (not a one-element local) and forward the whole list.
+        imgs_embed_.push_back(img_embed);  // Copies img_embed; the list grows by one per call and is emptied only by ClearImgsEmbed().
+        return Encode(imgs_embed_, out_embed, prompt, tokens_ids, tokens_diff);  // NOTE(review): receives every embedding pushed so far - confirm callers clear the list on context reset.
     }
 
     int Encode(std::vector<unsigned short> &out_embed, std::string prompt, std::string last_reply,
@@ -1447,7 +1449,12 @@ class LLM_CTX {
         return 0;
     }
 
-    std::string Run(std::vector<unsigned short> test_embed)
+    void ClearImgsEmbed()  // Drops every image embedding stored in imgs_embed_.
+    {
+        imgs_embed_.clear();  // imgs_embed_ is filled by the single-image Encode() overload, one entry per call.
+    }
+
+    std::string Run(std::vector<unsigned short> &test_embed)
     {
         b_stop = false;
         std::string final_out;