From 7c832a208c5b11a0bc6101aab00787a33341b742 Mon Sep 17 00:00:00 2001
From: Wei-Ming Chen
Date: Wed, 23 Aug 2023 14:05:20 -0400
Subject: [PATCH] fix gpu and update demo ui with shortcut (#35)

---
 transformer/application/chat.cc    | 38 +++++++++++++++++++++++-----
 transformer/download_model.py      |  4 +++
 transformer/include/profiler.h     | 40 ++++++++++++++++++++----------
 transformer/quantize_and_upload.py |  8 +++---
 transformer/src/OPTGenerate.cc     |  2 ++
 transformer/src/ops/linear.cc      |  4 +++
 6 files changed, 73 insertions(+), 23 deletions(-)

diff --git a/transformer/application/chat.cc b/transformer/application/chat.cc
index da8cc214..87df89fb 100644
--- a/transformer/application/chat.cc
+++ b/transformer/application/chat.cc
@@ -5,7 +5,7 @@
 
 std::map<std::string, int> model_config = {
     {"OPT_125m", OPT_125M}, {"OPT_1.3B", OPT_1_3B}, {"OPT_6.7B", OPT_6_7B}, {"LLaMA_7B", LLaMA_7B},
-    {"LLaMA_7B_AWQ", LLaMA_7B}, {"LLaMA_7B_2_chat", LLaMA_7B}, {"LLaMA_13B_2_chat", LLaMA_13B}};
+    {"LLaMA_7B_AWQ", LLaMA_7B}, {"LLaMA_7B_2_chat", LLaMA_7B}, {"7b", LLaMA_7B}, {"LLaMA_13B_2_chat", LLaMA_13B}, {"13b", LLaMA_13B}};
 
 std::map<std::string, std::string> model_path = {{"OPT_125m", "models/OPT_125m"},
                                                  {"OPT_1.3B", "models/OPT_1.3B"},
@@ -13,18 +13,22 @@ std::map<std::string, std::string> model_path = {{"OPT_125m", "models/OPT_125m"},
                                                  {"LLaMA_7B", "models/LLaMA_7B"},
                                                  {"LLaMA_7B_AWQ", "models/LLaMA_7B_AWQ"},
                                                  {"LLaMA_7B_2_chat", "models/LLaMA_7B_2_chat"},
-                                                 {"LLaMA_13B_2_chat", "models/LLaMA_13B_2_chat"}};
+                                                 {"7b", "models/LLaMA_7B_2_chat"},
+                                                 {"LLaMA_13B_2_chat", "models/LLaMA_13B_2_chat"},
+                                                 {"13b", "models/LLaMA_13B_2_chat"}};
 
 std::map<std::string, int> data_format_list = {
     {"FP32", FP32}, {"INT8", INT8}, {"INT4", INT4},
+    {"int4", INT4},
+    {"fp32", FP32},
 };
 
 bool isLLaMA(std::string s) {
     std::string LLaMA_prefix = "LLaMA";
 
-    if (s.substr(0, LLaMA_prefix.size()) == LLaMA_prefix)
+    if (s.substr(0, LLaMA_prefix.size()) == LLaMA_prefix || s == "7b" || s == "13b")
         return true;
     else
         return false;
 }
@@ -33,9 +37,11 @@
 int main(int argc, char* argv[]) {
     std::string target_model = "LLaMA_7B_2_chat";
     std::string target_data_format = "INT4";
+    Profiler::getInstance().for_demo = true;
 
     if (argc == 3) {
         auto target_str = argv[1];
+        target_model = argv[1];
         if (model_config.count(target_model) == 0) {
             std::cerr << "Model config:" << target_str << " unsupported" << std::endl;
             std::cerr << "Please select one of the following:";
@@ -46,12 +52,11 @@ int main(int argc, char* argv[]) {
             throw("Unsupported model\n");
         }
         std::cout << "Model: " << argv[1] << " selected" << std::endl;
-        target_model = argv[1];
 
         auto data_format_input = argv[2];
         if (data_format_list.count(data_format_input) == 0) {
             std::cerr << "Data format:" << data_format_input << " unsupported" << std::endl;
-            std::cerr << "Please select one of the following:";
+            std::cerr << "Please select one of the following: ";
             for (const auto& k : data_format_list) {
                 std::cerr << k.first << ", ";
             }
@@ -60,7 +65,23 @@ int main(int argc, char* argv[]) {
         }
         std::cout << "Data format: " << argv[2] << " selected" << std::endl;
         target_data_format = argv[2];
-    } else {
+    } else if (argc == 2) {
+        auto target_str = argv[1];
+        target_model = argv[1];
+        if (model_config.count(target_model) == 0) {
+            std::cerr << "Model config:" << target_str << " unsupported" << std::endl;
+            std::cerr << "Please select one of the following: ";
+            for (const auto& k : model_config) {
+                std::cerr << k.first << ", ";
+            }
+            std::cerr << std::endl;
+            throw("Unsupported model\n");
+        }
+        std::cout << "Model: " << argv[1] << " selected" << std::endl;
+
+        target_data_format = "INT4";  // only the model was given; keep the default data format
+    }
+    else {
         if (isLLaMA(target_model)) {
             std::cout << "Using model: " + target_model << std::endl;
             std::cout << "Using LLaMA's default data format: " + target_data_format << std::endl;
@@ -118,6 +139,10 @@ int main(int argc, char* argv[]) {
             std::cerr << "At this time, we only support FP32 and INT4 for LLaMA7B." << std::endl;
         }
     } else {  // OPT
+    #ifdef QM_CUDA
+        printf("OPT is not supported with CUDA backend yet.\n");
+        exit(-1);
+    #else
         // Load model
         std::cout << "Loading model... " << std::flush;
         int model_id = model_config[target_model];
@@ -175,5 +200,6 @@ int main(int argc, char* argv[]) {
 
         std::vector<int> generated_ids = OPTGenerate(&model, OPT_INT4, input_ids, generation_config, &encoder, true);
     }
+    #endif  // QM_CUDA
 }
 };
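The chat.cc hunks above add lowercase shortcuts ("7b", "13b", "int4", "fp32") and a new argc == 2 branch, so the demo can be launched with just a model name and the data format falls back to INT4. A minimal standalone sketch of the resulting lookup flow (the map entries mirror the diff; the surrounding program is illustrative, not part of the patch):

    #include <iostream>
    #include <map>
    #include <string>

    int main(int argc, char* argv[]) {
        // Entries copied from the patched model_path map in chat.cc.
        std::map<std::string, std::string> model_path = {
            {"LLaMA_7B_2_chat", "models/LLaMA_7B_2_chat"},
            {"7b", "models/LLaMA_7B_2_chat"},    // shortcut added by this patch
            {"LLaMA_13B_2_chat", "models/LLaMA_13B_2_chat"},
            {"13b", "models/LLaMA_13B_2_chat"},  // shortcut added by this patch
        };
        std::string target = (argc >= 2) ? argv[1] : "LLaMA_7B_2_chat";
        if (model_path.count(target) == 0) {
            std::cerr << "Unsupported model: " << target << std::endl;
            return -1;
        }
        // e.g. "7b -> models/LLaMA_7B_2_chat"
        std::cout << target << " -> " << model_path[target] << std::endl;
        return 0;
    }

Invoked as, say, `demo 7b` (binary name hypothetical), this performs the same name-to-path resolution the patched chat.cc does before loading weights, with INT4 implied by the single-argument form.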
"INT4"; + } + else { if (isLLaMA(target_model)) { std::cout << "Using model: " + target_model << std::endl; std::cout << "Using LLaMA's default data format: " + target_data_format << std::endl; @@ -118,6 +139,10 @@ int main(int argc, char* argv[]) { std::cerr << "At this time, we only support FP32 and INT4 for LLaMA7B." << std::endl; } } else { // OPT + #ifdef QM_CUDA + printf("OPT is not supported with CUDA backend yet."); + exit(-1); + #else // Load model std::cout << "Loading model... " << std::flush; int model_id = model_config[target_model]; @@ -175,5 +200,6 @@ int main(int argc, char* argv[]) { std::vector generated_ids = OPTGenerate(&model, OPT_INT4, input_ids, generation_config, &encoder, true); } + #endif // QN_CUDA } }; diff --git a/transformer/download_model.py b/transformer/download_model.py index c8d53036..a7d5f7d1 100644 --- a/transformer/download_model.py +++ b/transformer/download_model.py @@ -90,6 +90,10 @@ "url": "https://www.dropbox.com/scl/fi/du8rfgexkk4b2xp9j6yrn/LLaMA_7B_2_chat.zip?rlkey=2nao2sh4hi3t1dhltsoae2muw&dl=1", # noqa: E501 "md5sum": "d0b1d11e498ac7d0a2e90348e946a7f5", }, + "LLaMA_13B_2_chat": { + "url": "https://www.dropbox.com/scl/fi/fes1l27b9kv4dn4h0qjzu/LLaMA_13B_2_chat.zip?rlkey=u1j2kt96xpj764zkj1v87gw6u&dl=1", # noqa: E501 + "md5sum": "802c81d86b6393aff3e93326e5b58f7f", + }, }, "INT8": { "OPT_125m": { diff --git a/transformer/include/profiler.h b/transformer/include/profiler.h index 752b05b2..7f56829f 100644 --- a/transformer/include/profiler.h +++ b/transformer/include/profiler.h @@ -5,6 +5,7 @@ class Profiler { public: + bool for_demo = false; static Profiler& getInstance() { static Profiler instance; return instance; @@ -35,20 +36,33 @@ class Profiler { } void report_internal() const { - std::cout << "Section, Total time(us), Average time(us), Count, GOPs" << std::endl; - for (const auto& entry : durations) { - std::string row; - row += entry.first + ", "; - row += std::to_string(entry.second) + ", "; - row += std::to_string(entry.second / counts.at(entry.first)) + ", "; - if (flops.count(entry.first) == 0) - row += std::to_string(counts.at(entry.first)) + ", N/A"; - else { - row += std::to_string(counts.at(entry.first)) + ", "; - // ops and microsecond - row += std::to_string((((float)flops.at(entry.first)) / (float)(entry.second)) / 1000.0); + if (for_demo){ + std::cout << "Section, Total time(s), ms/token, #tokens" << std::endl; + + for (const auto& entry : durations) { + std::string row; + std::cout << entry.first + ", "; + float s = (float)(entry.second) / 1000000; + float ts = (float)counts.at(entry.first); + printf("Total time: %.1f s, %.1f ms/token, %.1f token/s, %d tokens\n" , s, s/ts*1000, ts/s, counts.at(entry.first)); + } + } + else{ + std::cout << "Section, Total time(us), Average time(us), Count, GOPs" << std::endl; + for (const auto& entry : durations) { + std::string row; + row += entry.first + ", "; + row += std::to_string(entry.second) + ", "; + row += std::to_string(entry.second / counts.at(entry.first)) + ", "; + if (flops.count(entry.first) == 0) + row += std::to_string(counts.at(entry.first)) + ", N/A"; + else { + row += std::to_string(counts.at(entry.first)) + ", "; + // ops and microsecond + row += std::to_string((((float)flops.at(entry.first)) / (float)(entry.second)) / 1000.0); + } + std::cout << row << std::endl; } - std::cout << row << std::endl; } } diff --git a/transformer/quantize_and_upload.py b/transformer/quantize_and_upload.py index b0a34758..d7709093 100644 --- a/transformer/quantize_and_upload.py +++ 
diff --git a/transformer/quantize_and_upload.py b/transformer/quantize_and_upload.py
index b0a34758..d7709093 100644
--- a/transformer/quantize_and_upload.py
+++ b/transformer/quantize_and_upload.py
@@ -11,7 +11,7 @@
 
 from upload import subebackups
 
-model_paths = ["models/LLaMA_7B", "models/LLaMA_7B_2_chat", "models/LLaMA_7B_AWQ", "models/LLaMA_13B_2_chat"]
+model_paths = ["models/LLaMA_13B_2_chat"]
 quantized_dir = "INT4"
 db_prefix = "/MIT/transformer_assets/"
 
@@ -38,8 +38,8 @@ def _get_parser():
     parser = _get_parser()
     args = parser.parse_args()
 
-    if args.method not in ["QM_x86", "QM_ARM", "FP32", "INT8"]:
-        raise ValueError("expect method to be one of ['QM_x86', 'QM_ARM', 'FP32', 'INT8']")
+    if args.method not in ["QM_x86", "QM_ARM", "QM_CUDA", "FP32", "INT8"]:
+        raise ValueError("expect method to be one of ['QM_x86', 'QM_ARM', 'QM_CUDA', 'FP32', 'INT8']")
 
     QM_method = args.method
     if args.model_path:
@@ -49,7 +49,7 @@ def _get_parser():
 
     for model_path in target_paths:
         # quantize
-        if args.method in ["QM_x86", "QM_ARM"]:
+        if args.method in ["QM_x86", "QM_CUDA", "QM_ARM"]:
             out_dir = quantized_dir
             quantize_cmd = (
                 f"python model_quantizer.py --model_path {model_path} --method {QM_method} --output_path {out_dir}"
diff --git a/transformer/src/OPTGenerate.cc b/transformer/src/OPTGenerate.cc
index 27a31362..9861f1c9 100644
--- a/transformer/src/OPTGenerate.cc
+++ b/transformer/src/OPTGenerate.cc
@@ -2,6 +2,7 @@
 #include "common.h"
 #include "utils.h"
 
+#ifndef QM_CUDA  // not supported yet
 // OPTGenerate function
 std::vector<int> OPTGenerate(void *model_ptr, int model_type, std::vector<int> input_ids,
                              const struct opt_params generation_config, Encoder *encoder, bool interactive) {
@@ -175,3 +176,4 @@ std::vector<int> OPTGenerate(void *model_ptr, int model_type, std::vector<int> input_ids,
 
     return generate_ids;
 }
+#endif
\ No newline at end of file
diff --git a/transformer/src/ops/linear.cc b/transformer/src/ops/linear.cc
index 26b57744..2a90d165 100644
--- a/transformer/src/ops/linear.cc
+++ b/transformer/src/ops/linear.cc
@@ -55,12 +55,14 @@ void Linear_FP::forward(const Matrix3D<float> &a, Matrix3D<float> &c) {
     params.opt_params.num_thread = NUM_THREAD;
 
     matmul::MatmulOperator op = matmul::MatmulOperator();
+    #ifndef QM_CUDA  // not supported yet
     if (this->has_bias) {
         params.bias.row = this->bias.m_dim_y;
         params.bias.column = this->bias.m_dim_z;
         params.bias.data_ptr = this->bias.m_data;
         op.mat_mul_accelerator_transposed_fastover_column_bias((const struct matmul_params *)&params);
     } else
+    #endif
         op.mat_mul_accelerator_transposed_fastover_column((const struct matmul_params *)&params);
 
     PROFILE_END(profile_name);
@@ -199,10 +201,12 @@ void Linear_FP_int4::forward(const Matrix3D<float> &x, Matrix3D<float> &output) {
 #ifdef PACK_QK
     params.B.int4_data_ptr = (uint8_t *)this->packed_weights;
 #endif
+#ifndef QM_CUDA  // not supported yet
     if (!this->has_bias)
         params.bias.data_ptr = NULL;
     else
         params.bias.data_ptr = this->bias.m_data;
+#endif
     op.mat_mul_accelerator_int8_int4_fast_no_offset(&params);
 #else
     op.mat_mul_accelerator_int4_fast_no_offset(&params);
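The OPTGenerate.cc and linear.cc hunks above compile out paths the CUDA backend does not support yet rather than porting them. A distilled sketch of that #ifndef QM_CUDA guard pattern; the function names here are hypothetical stand-ins for the matmul calls in linear.cc:

    #include <cstdio>

    void matmul_with_bias() { std::puts("bias matmul (CPU builds only)"); }
    void matmul_no_bias() { std::puts("no-bias matmul"); }

    void forward(bool has_bias) {
    #ifndef QM_CUDA  // bias path not supported on CUDA yet
        if (has_bias) {
            matmul_with_bias();
        } else
    #endif
            matmul_no_bias();
    }

    int main() {
        forward(true);   // CPU build: bias path; CUDA build (-DQM_CUDA): no-bias path
        forward(false);  // no-bias path in both builds
        return 0;
    }

Building with -DQM_CUDA removes the bias branch entirely, so even a layer that has a bias falls through to the no-bias matmul, matching the "not supported yet" comments the patch leaves in place.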