Quality of life fixes for GPU users and future development (#79)
Jiminator authored Nov 24, 2023
1 parent c3b94c7 commit 660229b
Showing 12 changed files with 226 additions and 322 deletions.
2 changes: 0 additions & 2 deletions .gitignore
@@ -27,13 +27,11 @@ test_*
!test_*.cu
demo
chat
voicechat
profile_*
!profile_*.cc
libtorch/

transformer/chat
transformer/voicechat
transformer/output.wav
transformer/tmpfile
transformer/TTS
7 changes: 1 addition & 6 deletions llm/Makefile
@@ -15,9 +15,8 @@ CXXFLAGS += $(DEFINE)
TEST_TARGET_GENERAL = test_Int8OPTAttention test_Int8OPTDecoderLayer test_Int8OPTDecoder test_OPTForCausalLM test_OPTTokenizer test_LLaMATokenizer test_OPTGenerate test_Fp32llamaAttention test_Fp32llamaDecoderLayer test_Fp32llamaDecoder test_Fp32llamaForCausalLM test_Fp32OPTAttention test_Fp32OPTDecoderLayer test_Fp32OPTDecoder test_Fp32OPTForCausalLM
TEST_TARGET_IF_CUDA = test_ops test_Int4llamaAttention test_Int4llamaDecoderLayer test_Int4llamaDecoder test_Int4llamaForCausalLM
PROFILE_TARGET = profile_Fp32llamaForCausalLM profile_Int4llamaForCausalLM profile_OPTForCausalLM profile_ops
APP_TARGET = voicechat
CHAT_TARGET = chat
TARGET = $(TEST_TARGET_GENERAL) $(TEST_TARGET_IF_CUDA) $(PROFILE_TARGET) $(APP_TARGET) $(CHAT_TARGET)
TARGET = $(TEST_TARGET_GENERAL) $(TEST_TARGET_IF_CUDA) $(PROFILE_TARGET) $(CHAT_TARGET)

BUILDDIR := build/transformer
PROFILEDIR := build_profile/transformer
@@ -219,10 +218,6 @@ profile_ops: tests/non_cuda/test_ops.cc $(PROFILE_OBJS)
$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -DPROFILER -o $@ $^ $(LIB) $(LDFLAGS)
endif

# Rule for APP_TARGET
$(APP_TARGET): %: application/%.cc $(OBJS)
$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -o $@ $^ $(LIB) $(LDFLAGS)

# Rule for CHAT_TARGET
$(CHAT_TARGET): %: application/%.cc $(OBJS)
$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -o $(CHATNAME) $^ $(LIB) $(LDFLAGS)
9 changes: 5 additions & 4 deletions llm/application/README.md
@@ -6,7 +6,7 @@

- Follow the [instructions](../../README.md) to download and deploy LLaMA2-7B-chat.

- Configure whisper.cpp
- Configure whisper.cpp. You may need to update the Makefile and ggml.h files of whisper.cpp to get it running. For related issues, please refer to the [whisper.cpp](https://github.com/ggerganov/whisper.cpp) repository.

```bash
# Get whisper.cpp for speech recognition
@@ -33,6 +33,7 @@

```bash
mkdir TTS
cd TTS
wget https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_arm64.tar.gz
tar -xvzf piper_arm64.tar.gz
```
@@ -51,9 +52,9 @@
nano application/sts_utils/speak
```

- Compile and start the voicechat locally.

```bash
make -j voicechat
./voicechat # voicechat.exe on Windows
make -j chat
./chat -v # chat.exe -v on Windows
```
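
For reference, a minimal sketch of how the bundled `clean_up.patch` might be applied before building. The checkout layout and the `make stream` target are assumptions (based on the steps above and 2023-era whisper.cpp), so adjust paths to your setup:

```bash
# Hypothetical layout: run from llm/, with whisper.cpp cloned here per the
# earlier steps. Apply the bundled patch, then rebuild the stream binary
# that application/sts_utils/listen invokes.
cd whisper.cpp
git apply ../application/sts_utils/clean_up.patch
make stream
cd ..
make -j chat
./chat -v
```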
104 changes: 84 additions & 20 deletions llm/application/chat.cc
@@ -1,5 +1,6 @@
#include <iostream>
#include <map>
#include <string>
#include <cstring>

#include "Generate.h"
@@ -73,13 +74,27 @@ bool convertToBool(const char* str) {
int NUM_THREAD = 8;

int main(int argc, char* argv[]) {
bool use_voicechat = false;

// Check for optional arguments
for (int i = 1; i < argc; ++i) {
if (strcmp(argv[i], "-v") == 0) {
use_voicechat = true;
// Remove the flag from argc and argv
for (int j = i; j < argc - 1; ++j) {
argv[j] = argv[j + 1];
}
--argc;
break;
}
}

std::string target_model = "LLaMA2_7B_chat";
std::string target_data_format = "INT4";
bool instruct = true;
Profiler::getInstance().for_demo = true;

std::cout << "TinyChatEngine by MIT HAN Lab: https://github.com/mit-han-lab/TinyChatEngine" << std::endl;

if (argc >= 3 && argc <= 5) {
if (argc >= 4) {
NUM_THREAD = atoi(argv[3]);
@@ -185,9 +200,20 @@ int main(int argc, char* argv[]) {

// Get input from the user
while (true) {
std::cout << "USER: ";
std::string input;
std::getline(std::cin, input);
if (use_voicechat){
int result = std::system("./application/sts_utils/listen");
std::ifstream in("tmpfile");
std::getline(in, input);
result = std::system("rm tmpfile");
(void)result;
std::cout << input << std::endl;
} else {
std::cout << "USER: ";
std::getline(std::cin, input);
}
if (input == "quit" || input == "Quit" || input == "Quit." || input == "quit.")
break;
if (instruct) {
std::cout << "ASSISTANT: " << std::endl;
if (isCodeLLaMA(target_model)) {
@@ -223,12 +249,23 @@ int main(int argc, char* argv[]) {
m_path = "INT4/" + m_path;
Int4LlamaForCausalLM model = Int4LlamaForCausalLM(m_path, get_opt_model_config(model_id));
std::cout << "Finished!" << std::endl;

// Get input from the user
while (true) {
std::cout << "USER: ";
std::string input;
std::getline(std::cin, input);
if (use_voicechat){
int result = std::system("./application/sts_utils/listen");
std::ifstream in("tmpfile");
std::getline(in, input);
result = std::system("rm tmpfile");
(void)result;
std::cout << input << std::endl;
} else {
std::cout << "USER: ";
std::getline(std::cin, input);
}
if (input == "quit" || input == "Quit" || input == "Quit." || input == "quit.")
break;
if (instruct) {
std::cout << "ASSISTANT: " << std::endl;
if (isCodeLLaMA(target_model)) {
@@ -256,8 +293,7 @@ int main(int argc, char* argv[]) {
input = "### Human: " + input + "\n### Assistant: \n";
}
}

LLaMAGenerate(m_path, &model, LLaMA_INT4, input, generation_config, "models/llama_vocab.bin", true, false);
LLaMAGenerate(m_path, &model, LLaMA_INT4, input, generation_config, "models/llama_vocab.bin", true, use_voicechat);
}
} else {
std::cout << std::endl;
@@ -293,7 +329,7 @@ int main(int argc, char* argv[]) {
std::getline(std::cin, input);
std::cout << input;

GPTBigCodeGenerate(m_path, &model, StarCoder_FP32, input, generation_config, "models/starcoder_vocab.bin", true, false);
GPTBigCodeGenerate(m_path, &model, StarCoder_FP32, input, generation_config, "models/starcoder_vocab.bin", true);
}
} else if (format_id == INT4) {
m_path = "INT4/" + m_path;
@@ -307,7 +343,7 @@ int main(int argc, char* argv[]) {
std::getline(std::cin, input);
std::cout << input;

GPTBigCodeGenerate(m_path, &model, StarCoder_INT4, input, generation_config, "models/starcoder_vocab.bin", true, false);
GPTBigCodeGenerate(m_path, &model, StarCoder_INT4, input, generation_config, "models/starcoder_vocab.bin", true);
}
} else {
std::cout << std::endl;
@@ -335,45 +371,73 @@ int main(int argc, char* argv[]) {
if (format_id == QINT8) {
OPTForCausalLM model = OPTForCausalLM("INT8/" + m_path, get_opt_model_config(model_id));
std::cout << "Finished!" << std::endl;

// Get input from the user
std::cout << "USER: ";
std::string input;
std::getline(std::cin, input);
if (use_voicechat){
int result = std::system("./application/sts_utils/listen");
std::ifstream in("tmpfile");
std::getline(in, input);
result = std::system("rm tmpfile");
(void)result;
std::cout << input << std::endl;
} else {
std::cout << "USER: ";
std::getline(std::cin, input);
}
std::vector<int> input_ids = encoder.encode(input);
std::string decoded = encoder.decode(input_ids);

// Generate
std::vector<int> generated_ids =
OPTGenerate(&model, OPT_INT8, input_ids, generation_config, &encoder, true, false);
OPTGenerate(&model, OPT_INT8, input_ids, generation_config, &encoder, true, use_voicechat);
} else if (format_id == FP32) {
Fp32OPTForCausalLM model = Fp32OPTForCausalLM(m_path, get_opt_model_config(model_id));
std::cout << "Finished!" << std::endl;

// Get input from the user
std::cout << "USER: ";
std::string input;
std::getline(std::cin, input);
if (use_voicechat){
int result = std::system("./application/sts_utils/listen");
std::ifstream in("tmpfile");
std::getline(in, input);
result = std::system("rm tmpfile");
(void)result;
std::cout << input << std::endl;
} else {
std::cout << "USER: ";
std::getline(std::cin, input);
}
std::vector<int> input_ids = encoder.encode(input);
std::string decoded = encoder.decode(input_ids);

// Generate
std::vector<int> generated_ids =
OPTGenerate(&model, OPT_FP32, input_ids, generation_config, &encoder, true, false);
OPTGenerate(&model, OPT_FP32, input_ids, generation_config, &encoder, true, use_voicechat);
} else if (format_id == INT4) {
Int4OPTForCausalLM model = Int4OPTForCausalLM("INT4/" + m_path, get_opt_model_config(model_id));
std::cout << "Finished!" << std::endl;

// Get input from the user
std::cout << "USER: ";
std::string input;
std::getline(std::cin, input);
if (use_voicechat){
int result = std::system("./application/sts_utils/listen");
std::ifstream in("tmpfile");
std::getline(in, input);
result = std::system("rm tmpfile");
(void)result;
std::cout << input << std::endl;
} else {
std::cout << "USER: ";
std::getline(std::cin, input);
}

std::vector<int> input_ids = encoder.encode(input);
std::string decoded = encoder.decode(input_ids);

// Generate
std::vector<int> generated_ids =
OPTGenerate(&model, OPT_INT4, input_ids, generation_config, &encoder, true, false);
OPTGenerate(&model, OPT_INT4, input_ids, generation_config, &encoder, true, use_voicechat);
}
#endif // QN_CUDA
}
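
The `-v` handling added above compacts `argv` in place so the positional-argument parsing that follows stays untouched. A standalone sketch of the same idiom (not part of the commit, illustrative only):

```cpp
#include <cstring>
#include <iostream>

int main(int argc, char* argv[]) {
    bool use_voicechat = false;
    for (int i = 1; i < argc; ++i) {
        if (std::strcmp(argv[i], "-v") == 0) {
            use_voicechat = true;
            // Shift the remaining arguments left over the flag...
            for (int j = i; j < argc - 1; ++j) argv[j] = argv[j + 1];
            // ...and shrink argc, so later code never sees "-v".
            --argc;
            break;
        }
    }
    std::cout << "voicechat=" << use_voicechat
              << ", positional args left=" << (argc - 1) << std::endl;
    return 0;
}
```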
55 changes: 45 additions & 10 deletions llm/application/sts_utils/clean_up.patch
@@ -52,10 +52,26 @@ index c598633..b342f16 100644

m_sample_rate = capture_spec_obtained.freq;
diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index 4c7f7d1..53304a6 100644
index 4c7f7d1..60845f4 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -171,7 +171,7 @@ int main(int argc, char ** argv) {
@@ -139,10 +139,15 @@ int main(int argc, char ** argv) {

const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line

+ if (use_vad){
+ fprintf(stderr, "USER: ");
+ }
+
params.no_timestamps = !use_vad;
params.no_context |= use_vad;
params.max_tokens = 0;

+
// init audio

audio_async audio(params.length_ms);
@@ -171,7 +176,7 @@ int main(int argc, char ** argv) {

// print some info about the processing
{
@@ -64,7 +80,7 @@ index 4c7f7d1..53304a6 100644
if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) {
params.language = "en";
@@ -179,24 +179,23 @@ int main(int argc, char ** argv) {
@@ -179,24 +184,21 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
Expand All @@ -91,17 +107,17 @@ index 4c7f7d1..53304a6 100644

if (!use_vad) {
fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
} else {
- } else {
- fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
+ fprintf(stderr, "USER: ");
}
- }
-
- fprintf(stderr, "\n");
+ }
+ // fprintf(stderr, "\n");
}

int n_iter = 0;
@@ -211,11 +210,9 @@ int main(int argc, char ** argv) {
@@ -211,11 +213,9 @@ int main(int argc, char ** argv) {
return 1;
}
}
Expand All @@ -114,7 +130,7 @@ index 4c7f7d1..53304a6 100644
const auto t_start = t_last;

// main audio loop
@@ -329,10 +326,6 @@ int main(int argc, char ** argv) {
@@ -329,10 +329,6 @@ int main(int argc, char ** argv) {
} else {
const int64_t t1 = (t_last - t_start).count()/1000000;
const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
Expand All @@ -125,7 +141,7 @@ index 4c7f7d1..53304a6 100644
}

const int n_segments = whisper_full_n_segments(ctx);
@@ -349,20 +342,10 @@ int main(int argc, char ** argv) {
@@ -349,20 +345,11 @@ int main(int argc, char ** argv) {
} else {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
Expand All @@ -140,6 +156,7 @@ index 4c7f7d1..53304a6 100644
-
- printf("%s", output.c_str());
- fflush(stdout);
+ text += 1;
+ printf ("%s\n", text);

if (params.fname_out.length() > 0) {
Expand All @@ -148,7 +165,7 @@ index 4c7f7d1..53304a6 100644
}
}
}
@@ -372,8 +355,7 @@ int main(int argc, char ** argv) {
@@ -372,8 +359,7 @@ int main(int argc, char ** argv) {
}

if (use_vad){
Expand All @@ -158,6 +175,24 @@ index 4c7f7d1..53304a6 100644
}
}

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 50df20e..2ebef36 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1835,11 +1835,11 @@ void ggml_init_cublas() {
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
- fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+ // fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
for (int id = 0; id < g_device_count; ++id) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
- fprintf(stderr, " Device %d: %s\n", id, prop.name);
+ // fprintf(stderr, " Device %d: %s\n", id, prop.name);
g_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem;
}
diff --git a/whisper.cpp b/whisper.cpp
index 9923fa0..bcfc5d9 100644
--- a/whisper.cpp
2 changes: 1 addition & 1 deletion llm/application/sts_utils/listen
@@ -25,4 +25,4 @@ options:
-tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model)
comm

./whisper.cpp/stream -m ./whisper.cpp/models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.7 -c 1 > tmpfile
./whisper.cpp/stream -m ./whisper.cpp/models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6 -c 1 > tmpfile
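
For readers tuning this script, an annotated form of the invocation. Flag meanings are taken from whisper.cpp's stream example (whose help text is partially visible above) and may vary across whisper.cpp versions:

```bash
#   -m             speech-recognition model to load
#   -t 6           number of CPU threads
#   --step 0       a step of 0 switches to VAD-driven (sliding-window) mode
#   --length 30000 audio window length in milliseconds
#   -vth 0.6       voice-activity-detection threshold (lowered from 0.7 here)
#   -c 1           audio capture (microphone) device ID
#   > tmpfile      chat reads the transcribed text back from this file
./whisper.cpp/stream -m ./whisper.cpp/models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6 -c 1 > tmpfile
```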