From 7c832a208c5b11a0bc6101aab00787a33341b742 Mon Sep 17 00:00:00 2001
From: Wei-Ming Chen
Date: Wed, 23 Aug 2023 14:05:20 -0400
Subject: [PATCH] fix gpu and update demo ui with shortcut (#35)

---
 transformer/application/chat.cc    | 38 +++++++++++++++++++++++-----
 transformer/download_model.py      |  4 +++
 transformer/include/profiler.h     | 40 ++++++++++++++++++++----------
 transformer/quantize_and_upload.py |  8 +++---
 transformer/src/OPTGenerate.cc     |  2 ++
 transformer/src/ops/linear.cc      |  4 +++
 6 files changed, 73 insertions(+), 23 deletions(-)

diff --git a/transformer/application/chat.cc b/transformer/application/chat.cc
index da8cc214..87df89fb 100644
--- a/transformer/application/chat.cc
+++ b/transformer/application/chat.cc
@@ -5,7 +5,7 @@
 
 std::map<std::string, int> model_config = {
     {"OPT_125m", OPT_125M}, {"OPT_1.3B", OPT_1_3B}, {"OPT_6.7B", OPT_6_7B}, {"LLaMA_7B", LLaMA_7B},
-    {"LLaMA_7B_AWQ", LLaMA_7B}, {"LLaMA_7B_2_chat", LLaMA_7B}, {"LLaMA_13B_2_chat", LLaMA_13B}};
+    {"LLaMA_7B_AWQ", LLaMA_7B}, {"LLaMA_7B_2_chat", LLaMA_7B}, {"7b", LLaMA_7B}, {"LLaMA_13B_2_chat", LLaMA_13B}, {"13b", LLaMA_13B}};
 
 std::map<std::string, std::string> model_path = {{"OPT_125m", "models/OPT_125m"},
                                                  {"OPT_1.3B", "models/OPT_1.3B"},
@@ -13,18 +13,22 @@ std::map<std::string, std::string> model_path = {{"OPT_125m", "models/OPT_125m"},
                                                  {"LLaMA_7B", "models/LLaMA_7B"},
                                                  {"LLaMA_7B_AWQ", "models/LLaMA_7B_AWQ"},
                                                  {"LLaMA_7B_2_chat", "models/LLaMA_7B_2_chat"},
-                                                 {"LLaMA_13B_2_chat", "models/LLaMA_13B_2_chat"}};
+                                                 {"7b", "models/LLaMA_7B_2_chat"},
+                                                 {"LLaMA_13B_2_chat", "models/LLaMA_13B_2_chat"},
+                                                 {"13b", "models/LLaMA_13B_2_chat"}};
 
 std::map<std::string, int> data_format_list = {
     {"FP32", FP32}, {"INT8", INT8}, {"INT4", INT4},
+    {"int4", INT4},
+    {"fp32", FP32},
 };
 
 bool isLLaMA(std::string s) {
     std::string LLaMA_prefix = "LLaMA";
 
-    if (s.substr(0, LLaMA_prefix.size()) == LLaMA_prefix)
+    if (s.substr(0, LLaMA_prefix.size()) == LLaMA_prefix || s == "7b" || s == "13b")
         return true;
     else
         return false;
 }
@@ -33,9 +37,11 @@
 int main(int argc, char* argv[]) {
     std::string target_model = "LLaMA_7B_2_chat";
     std::string target_data_format = "INT4";
+    Profiler::getInstance().for_demo = true;
 
     if (argc == 3) {
         auto target_str = argv[1];
+        target_model = argv[1];
         if (model_config.count(target_model) == 0) {
             std::cerr << "Model config:" << target_str << " unsupported" << std::endl;
             std::cerr << "Please select one of the following:";
@@ -46,12 +52,11 @@ int main(int argc, char* argv[]) {
             throw("Unsupported model\n");
         }
         std::cout << "Model: " << argv[1] << " selected" << std::endl;
-        target_model = argv[1];
 
         auto data_format_input = argv[2];
         if (data_format_list.count(data_format_input) == 0) {
             std::cerr << "Data format:" << data_format_input << " unsupported" << std::endl;
-            std::cerr << "Please select one of the following:";
+            std::cerr << "Please select one of the following: ";
             for (const auto& k : data_format_list) {
                 std::cerr << k.first << ", ";
             }
@@ -60,7 +65,23 @@ int main(int argc, char* argv[]) {
         }
         std::cout << "Data format: " << argv[2] << " selected" << std::endl;
         target_data_format = argv[2];
-    } else {
+    } else if (argc == 2) {
+        auto target_str = argv[1];
+        target_model = argv[1];
+        if (model_config.count(target_model) == 0) {
+            std::cerr << "Model config:" << target_str << " unsupported" << std::endl;
+            std::cerr << "Please select one of the following: ";
+            for (const auto& k : model_config) {
+                std::cerr << k.first << ", ";
+            }
+            std::cerr << std::endl;
+            throw("Unsupported model\n");
+        }
+        std::cout << "Model: " << argv[1] << " selected" << std::endl;
+
+        target_data_format = "INT4";  // only the model was given; keep the default data format
+    }
+    else {
         if (isLLaMA(target_model)) {
             std::cout << "Using model: " + target_model << std::endl;
             std::cout << "Using LLaMA's default data format: " + target_data_format << std::endl;
@@ -118,6 +139,10 @@ int main(int argc, char* argv[]) {
             std::cerr << "At this time, we only support FP32 and INT4 for LLaMA7B." << std::endl;
         }
     } else {  // OPT
+    #ifdef QM_CUDA
+        printf("OPT is not supported with CUDA backend yet.\n");
+        exit(-1);
+    #else
         // Load model
         std::cout << "Loading model... " << std::flush;
         int model_id = model_config[target_model];
@@ -175,5 +200,6 @@ int main(int argc, char* argv[]) {
 
         std::vector<int> generated_ids = OPTGenerate(&model, OPT_INT4, input_ids, generation_config, &encoder, true);
     }
+    #endif  // QM_CUDA
 }
 };
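The chat.cc hunks above add lowercase shortcuts ("7b", "13b", "int4", "fp32") and a new argc == 2 branch, so the demo can be launched with just a model name and the data format falls back to INT4. A minimal standalone sketch of the resulting lookup flow (the map entries mirror the diff; the surrounding program is illustrative, not part of the patch):

    #include <iostream>
    #include <map>
    #include <string>

    int main(int argc, char* argv[]) {
        // Entries copied from the patched model_path map in chat.cc.
        std::map<std::string, std::string> model_path = {
            {"LLaMA_7B_2_chat", "models/LLaMA_7B_2_chat"},
            {"7b", "models/LLaMA_7B_2_chat"},    // shortcut added by this patch
            {"LLaMA_13B_2_chat", "models/LLaMA_13B_2_chat"},
            {"13b", "models/LLaMA_13B_2_chat"},  // shortcut added by this patch
        };
        std::string target = (argc >= 2) ? argv[1] : "LLaMA_7B_2_chat";
        if (model_path.count(target) == 0) {
            std::cerr << "Unsupported model: " << target << std::endl;
            return -1;
        }
        // e.g. "7b -> models/LLaMA_7B_2_chat"
        std::cout << target << " -> " << model_path[target] << std::endl;
        return 0;
    }

Invoked as, say, `demo 7b` (binary name hypothetical), this performs the same name-to-path resolution the patched chat.cc does before loading weights, with INT4 implied by the single-argument form.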
"INT4"; + } + else { if (isLLaMA(target_model)) { std::cout << "Using model: " + target_model << std::endl; std::cout << "Using LLaMA's default data format: " + target_data_format << std::endl; @@ -118,6 +139,10 @@ int main(int argc, char* argv[]) { std::cerr << "At this time, we only support FP32 and INT4 for LLaMA7B." << std::endl; } } else { // OPT + #ifdef QM_CUDA + printf("OPT is not supported with CUDA backend yet."); + exit(-1); + #else // Load model std::cout << "Loading model... " << std::flush; int model_id = model_config[target_model]; @@ -175,5 +200,6 @@ int main(int argc, char* argv[]) { std::vector generated_ids = OPTGenerate(&model, OPT_INT4, input_ids, generation_config, &encoder, true); } + #endif // QN_CUDA } }; diff --git a/transformer/download_model.py b/transformer/download_model.py index c8d53036..a7d5f7d1 100644 --- a/transformer/download_model.py +++ b/transformer/download_model.py @@ -90,6 +90,10 @@ "url": "https://www.dropbox.com/scl/fi/du8rfgexkk4b2xp9j6yrn/LLaMA_7B_2_chat.zip?rlkey=2nao2sh4hi3t1dhltsoae2muw&dl=1", # noqa: E501 "md5sum": "d0b1d11e498ac7d0a2e90348e946a7f5", }, + "LLaMA_13B_2_chat": { + "url": "https://www.dropbox.com/scl/fi/fes1l27b9kv4dn4h0qjzu/LLaMA_13B_2_chat.zip?rlkey=u1j2kt96xpj764zkj1v87gw6u&dl=1", # noqa: E501 + "md5sum": "802c81d86b6393aff3e93326e5b58f7f", + }, }, "INT8": { "OPT_125m": { diff --git a/transformer/include/profiler.h b/transformer/include/profiler.h index 752b05b2..7f56829f 100644 --- a/transformer/include/profiler.h +++ b/transformer/include/profiler.h @@ -5,6 +5,7 @@ class Profiler { public: + bool for_demo = false; static Profiler& getInstance() { static Profiler instance; return instance; @@ -35,20 +36,33 @@ class Profiler { } void report_internal() const { - std::cout << "Section, Total time(us), Average time(us), Count, GOPs" << std::endl; - for (const auto& entry : durations) { - std::string row; - row += entry.first + ", "; - row += std::to_string(entry.second) + ", "; - row += std::to_string(entry.second / counts.at(entry.first)) + ", "; - if (flops.count(entry.first) == 0) - row += std::to_string(counts.at(entry.first)) + ", N/A"; - else { - row += std::to_string(counts.at(entry.first)) + ", "; - // ops and microsecond - row += std::to_string((((float)flops.at(entry.first)) / (float)(entry.second)) / 1000.0); + if (for_demo){ + std::cout << "Section, Total time(s), ms/token, #tokens" << std::endl; + + for (const auto& entry : durations) { + std::string row; + std::cout << entry.first + ", "; + float s = (float)(entry.second) / 1000000; + float ts = (float)counts.at(entry.first); + printf("Total time: %.1f s, %.1f ms/token, %.1f token/s, %d tokens\n" , s, s/ts*1000, ts/s, counts.at(entry.first)); + } + } + else{ + std::cout << "Section, Total time(us), Average time(us), Count, GOPs" << std::endl; + for (const auto& entry : durations) { + std::string row; + row += entry.first + ", "; + row += std::to_string(entry.second) + ", "; + row += std::to_string(entry.second / counts.at(entry.first)) + ", "; + if (flops.count(entry.first) == 0) + row += std::to_string(counts.at(entry.first)) + ", N/A"; + else { + row += std::to_string(counts.at(entry.first)) + ", "; + // ops and microsecond + row += std::to_string((((float)flops.at(entry.first)) / (float)(entry.second)) / 1000.0); + } + std::cout << row << std::endl; } - std::cout << row << std::endl; } } diff --git a/transformer/quantize_and_upload.py b/transformer/quantize_and_upload.py index b0a34758..d7709093 100644 --- a/transformer/quantize_and_upload.py +++ 
diff --git a/transformer/quantize_and_upload.py b/transformer/quantize_and_upload.py
index b0a34758..d7709093 100644
--- a/transformer/quantize_and_upload.py
+++ b/transformer/quantize_and_upload.py
@@ -11,7 +11,7 @@
 
 from upload import subebackups
 
-model_paths = ["models/LLaMA_7B", "models/LLaMA_7B_2_chat", "models/LLaMA_7B_AWQ", "models/LLaMA_13B_2_chat"]
+model_paths = ["models/LLaMA_13B_2_chat"]
 quantized_dir = "INT4"
 db_prefix = "/MIT/transformer_assets/"
 
@@ -38,8 +38,8 @@ def _get_parser():
     parser = _get_parser()
     args = parser.parse_args()
 
-    if args.method not in ["QM_x86", "QM_ARM", "FP32", "INT8"]:
-        raise ValueError("expect method to be one of ['QM_x86', 'QM_ARM', 'FP32', 'INT8']")
+    if args.method not in ["QM_x86", "QM_ARM", "QM_CUDA", "FP32", "INT8"]:
+        raise ValueError("expect method to be one of ['QM_x86', 'QM_ARM', 'QM_CUDA', 'FP32', 'INT8']")
 
     QM_method = args.method
     if args.model_path:
@@ -49,7 +49,7 @@ def _get_parser():
 
     for model_path in target_paths:
         # quantize
-        if args.method in ["QM_x86", "QM_ARM"]:
+        if args.method in ["QM_x86", "QM_CUDA", "QM_ARM"]:
             out_dir = quantized_dir
             quantize_cmd = (
                 f"python model_quantizer.py --model_path {model_path} --method {QM_method} --output_path {out_dir}"
diff --git a/transformer/src/OPTGenerate.cc b/transformer/src/OPTGenerate.cc
index 27a31362..9861f1c9 100644
--- a/transformer/src/OPTGenerate.cc
+++ b/transformer/src/OPTGenerate.cc
@@ -2,6 +2,7 @@
 #include "common.h"
 #include "utils.h"
 
+#ifndef QM_CUDA  // not supported yet
 // OPTGenerate function
 std::vector<int> OPTGenerate(void *model_ptr, int model_type, std::vector<int> input_ids,
                              const struct opt_params generation_config, Encoder *encoder, bool interactive) {
@@ -175,3 +176,4 @@ std::vector<int> OPTGenerate(void *model_ptr, int model_type, std::vector<int> input_ids,
 
     return generate_ids;
 }
+#endif
\ No newline at end of file
diff --git a/transformer/src/ops/linear.cc b/transformer/src/ops/linear.cc
index 26b57744..2a90d165 100644
--- a/transformer/src/ops/linear.cc
+++ b/transformer/src/ops/linear.cc
@@ -55,12 +55,14 @@ void Linear_FP::forward(const Matrix3D<float> &a, Matrix3D<float> &c) {
     params.opt_params.num_thread = NUM_THREAD;
 
     matmul::MatmulOperator op = matmul::MatmulOperator();
+    #ifndef QM_CUDA  // not supported yet
     if (this->has_bias) {
         params.bias.row = this->bias.m_dim_y;
         params.bias.column = this->bias.m_dim_z;
         params.bias.data_ptr = this->bias.m_data;
         op.mat_mul_accelerator_transposed_fastover_column_bias((const struct matmul_params *)&params);
     } else
+    #endif
         op.mat_mul_accelerator_transposed_fastover_column((const struct matmul_params *)&params);
 
     PROFILE_END(profile_name);
@@ -199,10 +201,12 @@ void Linear_FP_int4::forward(const Matrix3D<float> &x, Matrix3D<float> &output) {
 #ifdef PACK_QK
     params.B.int4_data_ptr = (uint8_t *)this->packed_weights;
 #endif
+#ifndef QM_CUDA  // not supported yet
     if (!this->has_bias)
         params.bias.data_ptr = NULL;
     else
         params.bias.data_ptr = this->bias.m_data;
+#endif
     op.mat_mul_accelerator_int8_int4_fast_no_offset(&params);
 #else
     op.mat_mul_accelerator_int4_fast_no_offset(&params);
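The OPTGenerate.cc and linear.cc hunks above compile out paths the CUDA backend does not support yet rather than porting them. A distilled sketch of that #ifndef QM_CUDA guard pattern; the function names here are hypothetical stand-ins for the matmul calls in linear.cc:

    #include <cstdio>

    void matmul_with_bias() { std::puts("bias matmul (CPU builds only)"); }
    void matmul_no_bias() { std::puts("no-bias matmul"); }

    void forward(bool has_bias) {
    #ifndef QM_CUDA  // bias path not supported on CUDA yet
        if (has_bias) {
            matmul_with_bias();
        } else
    #endif
            matmul_no_bias();
    }

    int main() {
        forward(true);   // CPU build: bias path; CUDA build (-DQM_CUDA): no-bias path
        forward(false);  // no-bias path in both builds
        return 0;
    }

Building with -DQM_CUDA removes the bias branch entirely, so even a layer that has a bias falls through to the no-bias matmul, matching the "not supported yet" comments the patch leaves in place.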