Quality of life fixes for GPU users and future development (#79)
Jiminator authored Nov 24, 2023
1 parent c3b94c7 commit 660229b
Showing 12 changed files with 226 additions and 322 deletions.
2 changes: 0 additions & 2 deletions .gitignore
@@ -27,13 +27,11 @@ test_*
!test_*.cu
demo
chat
voicechat
profile_*
!profile_*.cc
libtorch/

transformer/chat
transformer/voicechat
transformer/output.wav
transformer/tmpfile
transformer/TTS
7 changes: 1 addition & 6 deletions llm/Makefile
@@ -15,9 +15,8 @@ CXXFLAGS += $(DEFINE)
TEST_TARGET_GENERAL = test_Int8OPTAttention test_Int8OPTDecoderLayer test_Int8OPTDecoder test_OPTForCausalLM test_OPTTokenizer test_LLaMATokenizer test_OPTGenerate test_Fp32llamaAttention test_Fp32llamaDecoderLayer test_Fp32llamaDecoder test_Fp32llamaForCausalLM test_Fp32OPTAttention test_Fp32OPTDecoderLayer test_Fp32OPTDecoder test_Fp32OPTForCausalLM
TEST_TARGET_IF_CUDA = test_ops test_Int4llamaAttention test_Int4llamaDecoderLayer test_Int4llamaDecoder test_Int4llamaForCausalLM
PROFILE_TARGET = profile_Fp32llamaForCausalLM profile_Int4llamaForCausalLM profile_OPTForCausalLM profile_ops
APP_TARGET = voicechat
CHAT_TARGET = chat
TARGET = $(TEST_TARGET_GENERAL) $(TEST_TARGET_IF_CUDA) $(PROFILE_TARGET) $(APP_TARGET) $(CHAT_TARGET)
TARGET = $(TEST_TARGET_GENERAL) $(TEST_TARGET_IF_CUDA) $(PROFILE_TARGET) $(CHAT_TARGET)

BUILDDIR := build/transformer
PROFILEDIR := build_profile/transformer
@@ -219,10 +218,6 @@ profile_ops: tests/non_cuda/test_ops.cc $(PROFILE_OBJS)
$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -DPROFILER -o $@ $^ $(LIB) $(LDFLAGS)
endif

# Rule for APP_TARGET
$(APP_TARGET): %: application/%.cc $(OBJS)
$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -o $@ $^ $(LIB) $(LDFLAGS)

# Rule for CHAT_TARGET
$(CHAT_TARGET): %: application/%.cc $(OBJS)
$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -o $(CHATNAME) $^ $(LIB) $(LDFLAGS)
9 changes: 5 additions & 4 deletions llm/application/README.md
@@ -6,7 +6,7 @@

- Follow the [instructions](../../README.md) to download and deploy LLaMA2-7B-chat.

- Configure whisper.cpp
- Configure whisper.cpp. You may need to update the Makefile and ggml.h files of whisper.cpp to get it running. For related issues, please refer to the [whisper.cpp](https://github.com/ggerganov/whisper.cpp) repository.

```bash
# Get whisper.cpp for speech recognition
@@ -33,6 +33,7 @@

```bash
mkdir TTS
cd TTS
wget https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_arm64.tar.gz
tar -xvzf piper_arm64.tar.gz
```
@@ -51,9 +52,9 @@
nano application/sts_utils/speak
```

- Compile and start the voicechat locally.

```bash
make -j voicechat
./voicechat # voicechat.exe on Windows
make -j chat
./chat -v # chat.exe -v on Windows
```
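
For reference, a minimal sketch of how the bundled `clean_up.patch` might be applied before building. The checkout layout and the `make stream` target are assumptions (based on the steps above and 2023-era whisper.cpp), so adjust paths to your setup:

```bash
# Hypothetical layout: run from llm/, with whisper.cpp cloned here per the
# earlier steps. Apply the bundled patch, then rebuild the stream binary
# that application/sts_utils/listen invokes.
cd whisper.cpp
git apply ../application/sts_utils/clean_up.patch
make stream
cd ..
make -j chat
./chat -v
```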
104 changes: 84 additions & 20 deletions llm/application/chat.cc
@@ -1,5 +1,6 @@
#include <iostream>
#include <map>
#include <string>
#include <cstring>

#include "Generate.h"
@@ -73,13 +74,27 @@ bool convertToBool(const char* str) {
int NUM_THREAD = 8;

int main(int argc, char* argv[]) {
bool use_voicechat = false;

// Check for optional arguments
for (int i = 1; i < argc; ++i) {
if (strcmp(argv[i], "-v") == 0) {
use_voicechat = true;
// Remove the flag from argc and argv
for (int j = i; j < argc - 1; ++j) {
argv[j] = argv[j + 1];
}
--argc;
break;
}
}

std::string target_model = "LLaMA2_7B_chat";
std::string target_data_format = "INT4";
bool instruct = true;
Profiler::getInstance().for_demo = true;

std::cout << "TinyChatEngine by MIT HAN Lab: https://github.com/mit-han-lab/TinyChatEngine" << std::endl;

if (argc >= 3 && argc <= 5) {
if (argc >= 4) {
NUM_THREAD = atoi(argv[3]);
@@ -185,9 +200,20 @@ int main(int argc, char* argv[]) {

// Get input from the user
while (true) {
std::cout << "USER: ";
std::string input;
std::getline(std::cin, input);
if (use_voicechat){
int result = std::system("./application/sts_utils/listen");
std::ifstream in("tmpfile");
std::getline(in, input);
result = std::system("rm tmpfile");
(void)result;
std::cout << input << std::endl;
} else {
std::cout << "USER: ";
std::getline(std::cin, input);
}
if (input == "quit" || input == "Quit" || input == "Quit." || input == "quit.")
break;
if (instruct) {
std::cout << "ASSISTANT: " << std::endl;
if (isCodeLLaMA(target_model)) {
@@ -223,12 +249,23 @@ int main(int argc, char* argv[]) {
m_path = "INT4/" + m_path;
Int4LlamaForCausalLM model = Int4LlamaForCausalLM(m_path, get_opt_model_config(model_id));
std::cout << "Finished!" << std::endl;

// Get input from the user
while (true) {
std::cout << "USER: ";
std::string input;
std::getline(std::cin, input);
if (use_voicechat){
int result = std::system("./application/sts_utils/listen");
std::ifstream in("tmpfile");
std::getline(in, input);
result = std::system("rm tmpfile");
(void)result;
std::cout << input << std::endl;
} else {
std::cout << "USER: ";
std::getline(std::cin, input);
}
if (input == "quit" || input == "Quit" || input == "Quit." || input == "quit.")
break;
if (instruct) {
std::cout << "ASSISTANT: " << std::endl;
if (isCodeLLaMA(target_model)) {
@@ -256,8 +293,7 @@ int main(int argc, char* argv[]) {
input = "### Human: " + input + "\n### Assistant: \n";
}
}

LLaMAGenerate(m_path, &model, LLaMA_INT4, input, generation_config, "models/llama_vocab.bin", true, false);
LLaMAGenerate(m_path, &model, LLaMA_INT4, input, generation_config, "models/llama_vocab.bin", true, use_voicechat);
}
} else {
std::cout << std::endl;
@@ -293,7 +329,7 @@ int main(int argc, char* argv[]) {
std::getline(std::cin, input);
std::cout << input;

GPTBigCodeGenerate(m_path, &model, StarCoder_FP32, input, generation_config, "models/starcoder_vocab.bin", true, false);
GPTBigCodeGenerate(m_path, &model, StarCoder_FP32, input, generation_config, "models/starcoder_vocab.bin", true);
}
} else if (format_id == INT4) {
m_path = "INT4/" + m_path;
@@ -307,7 +343,7 @@ int main(int argc, char* argv[]) {
std::getline(std::cin, input);
std::cout << input;

GPTBigCodeGenerate(m_path, &model, StarCoder_INT4, input, generation_config, "models/starcoder_vocab.bin", true, false);
GPTBigCodeGenerate(m_path, &model, StarCoder_INT4, input, generation_config, "models/starcoder_vocab.bin", true);
}
} else {
std::cout << std::endl;
@@ -335,45 +371,73 @@ int main(int argc, char* argv[]) {
if (format_id == QINT8) {
OPTForCausalLM model = OPTForCausalLM("INT8/" + m_path, get_opt_model_config(model_id));
std::cout << "Finished!" << std::endl;

// Get input from the user
std::cout << "USER: ";
std::string input;
std::getline(std::cin, input);
if (use_voicechat){
int result = std::system("./application/sts_utils/listen");
std::ifstream in("tmpfile");
std::getline(in, input);
result = std::system("rm tmpfile");
(void)result;
std::cout << input << std::endl;
} else {
std::cout << "USER: ";
std::getline(std::cin, input);
}
std::vector<int> input_ids = encoder.encode(input);
std::string decoded = encoder.decode(input_ids);

// Generate
std::vector<int> generated_ids =
OPTGenerate(&model, OPT_INT8, input_ids, generation_config, &encoder, true, false);
OPTGenerate(&model, OPT_INT8, input_ids, generation_config, &encoder, true, use_voicechat);
} else if (format_id == FP32) {
Fp32OPTForCausalLM model = Fp32OPTForCausalLM(m_path, get_opt_model_config(model_id));
std::cout << "Finished!" << std::endl;

// Get input from the user
std::cout << "USER: ";
std::string input;
std::getline(std::cin, input);
if (use_voicechat){
int result = std::system("./application/sts_utils/listen");
std::ifstream in("tmpfile");
std::getline(in, input);
result = std::system("rm tmpfile");
(void)result;
std::cout << input << std::endl;
} else {
std::cout << "USER: ";
std::getline(std::cin, input);
}
std::vector<int> input_ids = encoder.encode(input);
std::string decoded = encoder.decode(input_ids);

// Generate
std::vector<int> generated_ids =
OPTGenerate(&model, OPT_FP32, input_ids, generation_config, &encoder, true, false);
OPTGenerate(&model, OPT_FP32, input_ids, generation_config, &encoder, true, use_voicechat);
} else if (format_id == INT4) {
Int4OPTForCausalLM model = Int4OPTForCausalLM("INT4/" + m_path, get_opt_model_config(model_id));
std::cout << "Finished!" << std::endl;

// Get input from the user
std::cout << "USER: ";
std::string input;
std::getline(std::cin, input);
if (use_voicechat){
int result = std::system("./application/sts_utils/listen");
std::ifstream in("tmpfile");
std::getline(in, input);
result = std::system("rm tmpfile");
(void)result;
std::cout << input << std::endl;
} else {
std::cout << "USER: ";
std::getline(std::cin, input);
}

std::vector<int> input_ids = encoder.encode(input);
std::string decoded = encoder.decode(input_ids);

// Generate
std::vector<int> generated_ids =
OPTGenerate(&model, OPT_INT4, input_ids, generation_config, &encoder, true, false);
OPTGenerate(&model, OPT_INT4, input_ids, generation_config, &encoder, true, use_voicechat);
}
#endif // QN_CUDA
}
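
The `-v` handling added above compacts `argv` in place so the positional-argument parsing that follows stays untouched. A standalone sketch of the same idiom (not part of the commit, illustrative only):

```cpp
#include <cstring>
#include <iostream>

int main(int argc, char* argv[]) {
    bool use_voicechat = false;
    for (int i = 1; i < argc; ++i) {
        if (std::strcmp(argv[i], "-v") == 0) {
            use_voicechat = true;
            // Shift the remaining arguments left over the flag...
            for (int j = i; j < argc - 1; ++j) argv[j] = argv[j + 1];
            // ...and shrink argc, so later code never sees "-v".
            --argc;
            break;
        }
    }
    std::cout << "voicechat=" << use_voicechat
              << ", positional args left=" << (argc - 1) << std::endl;
    return 0;
}
```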
55 changes: 45 additions & 10 deletions llm/application/sts_utils/clean_up.patch
@@ -52,10 +52,26 @@ index c598633..b342f16 100644

m_sample_rate = capture_spec_obtained.freq;
diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index 4c7f7d1..53304a6 100644
index 4c7f7d1..60845f4 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -171,7 +171,7 @@ int main(int argc, char ** argv) {
@@ -139,10 +139,15 @@ int main(int argc, char ** argv) {

const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line

+ if (use_vad){
+ fprintf(stderr, "USER: ");
+ }
+
params.no_timestamps = !use_vad;
params.no_context |= use_vad;
params.max_tokens = 0;

+
// init audio

audio_async audio(params.length_ms);
@@ -171,7 +176,7 @@ int main(int argc, char ** argv) {

// print some info about the processing
{
@@ -64,7 +80,7 @@ index 4c7f7d1..53304a6 100644
if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) {
params.language = "en";
@@ -179,24 +179,23 @@ int main(int argc, char ** argv) {
@@ -179,24 +184,21 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
Expand All @@ -91,17 +107,17 @@ index 4c7f7d1..53304a6 100644

if (!use_vad) {
fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
} else {
- } else {
- fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
+ fprintf(stderr, "USER: ");
}
- }
-
- fprintf(stderr, "\n");
+ }
+ // fprintf(stderr, "\n");
}

int n_iter = 0;
@@ -211,11 +210,9 @@ int main(int argc, char ** argv) {
@@ -211,11 +213,9 @@ int main(int argc, char ** argv) {
return 1;
}
}
Expand All @@ -114,7 +130,7 @@ index 4c7f7d1..53304a6 100644
const auto t_start = t_last;

// main audio loop
@@ -329,10 +326,6 @@ int main(int argc, char ** argv) {
@@ -329,10 +329,6 @@ int main(int argc, char ** argv) {
} else {
const int64_t t1 = (t_last - t_start).count()/1000000;
const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
Expand All @@ -125,7 +141,7 @@ index 4c7f7d1..53304a6 100644
}

const int n_segments = whisper_full_n_segments(ctx);
@@ -349,20 +342,10 @@ int main(int argc, char ** argv) {
@@ -349,20 +345,11 @@ int main(int argc, char ** argv) {
} else {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
Expand All @@ -140,6 +156,7 @@ index 4c7f7d1..53304a6 100644
-
- printf("%s", output.c_str());
- fflush(stdout);
+ text += 1;
+ printf ("%s\n", text);

if (params.fname_out.length() > 0) {
Expand All @@ -148,7 +165,7 @@ index 4c7f7d1..53304a6 100644
}
}
}
@@ -372,8 +355,7 @@ int main(int argc, char ** argv) {
@@ -372,8 +359,7 @@ int main(int argc, char ** argv) {
}

if (use_vad){
Expand All @@ -158,6 +175,24 @@ index 4c7f7d1..53304a6 100644
}
}

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 50df20e..2ebef36 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1835,11 +1835,11 @@ void ggml_init_cublas() {
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
- fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+ // fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
for (int id = 0; id < g_device_count; ++id) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
- fprintf(stderr, " Device %d: %s\n", id, prop.name);
+ // fprintf(stderr, " Device %d: %s\n", id, prop.name);
g_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem;
}
diff --git a/whisper.cpp b/whisper.cpp
index 9923fa0..bcfc5d9 100644
--- a/whisper.cpp
2 changes: 1 addition & 1 deletion llm/application/sts_utils/listen
@@ -25,4 +25,4 @@ options:
-tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model)
comm

./whisper.cpp/stream -m ./whisper.cpp/models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.7 -c 1 > tmpfile
./whisper.cpp/stream -m ./whisper.cpp/models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6 -c 1 > tmpfile
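
For readers tuning this script, an annotated form of the invocation. Flag meanings are taken from whisper.cpp's stream example (whose help text is partially visible above) and may vary across whisper.cpp versions:

```bash
#   -m             speech-recognition model to load
#   -t 6           number of CPU threads
#   --step 0       a step of 0 switches to VAD-driven (sliding-window) mode
#   --length 30000 audio window length in milliseconds
#   -vth 0.6       voice-activity-detection threshold (lowered from 0.7 here)
#   -c 1           audio capture (microphone) device ID
#   > tmpfile      chat reads the transcribed text back from this file
./whisper.cpp/stream -m ./whisper.cpp/models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6 -c 1 > tmpfile
```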