Update VILA and UI (#97)
RaymondWang0 authored Mar 3, 2024
1 parent d0fed69 · commit 6fb8866
Showing 22 changed files with 1,096 additions and 71 deletions.
README.md: 12 changes (8 additions, 4 deletions)
````diff
@@ -151,15 +151,15 @@ TinyChatEngine offers versatile capabilities suitable for various applications.
 
 - Start the speech-to-speech chat locally.
   ```bash
-  ./chat -v # chat.exe -v on Windows
+  ./voicechat # voicechat.exe on Windows
   ```
 
 - If you encounter any issues or errors during setup, please explore [here](llm/application/README.md) to follow the step-by-step guide to debug.
 
 
 ## Deploy vision language model (VLM) chatbot with TinyChatEngine
 
-TinyChatEngine supports not only LLM but also VLM. We introduce a sophisticated text/voice chatbot for VLM. Here, we provide very easy-to-follow instructions to deploy vision language model chatbot (VILA-7B) with TinyChatEngine.
+TinyChatEngine supports not only LLM but also VLM. We introduce a sophisticated text/voice chatbot for VLM. Here, we provide easy-to-follow instructions to deploy vision language model chatbot (VILA-7B) with TinyChatEngine. We recommend using M1/M2 MacBooks for this VLM feature.
 
 - Follow the instructions above to setup the basic environment, i.e., [Prerequisites](#prerequisites) and [Step-by-step to Deploy LLaMA2-7B-chat with TinyChatEngine](#step-by-step-to-deploy-llama2-7b-chat-with-tinychatengine).
 
@@ -169,6 +169,10 @@ TinyChatEngine supports not only LLM but also VLM. We introduce a sophisticated
 - (For other OS) Please refer to [here](https://github.com/AnonymouX47/termvisage?tab=readme-ov-file#requirements) to get the appropriate terminal ready.
 
 - (Optional) To enable the speech-to-speech chatbot for VLM, please follow the [instruction above](#deploy-speech-to-speech-chatbot-with-tinychatengine-demo) to run the shell script to set up the environment.
+  ```bash
+  cd llm
+  ./voicechat_setup.sh
+  ```
 
 - Download the quantized VILA-7B model from our model zoo.
 
@@ -184,12 +188,12 @@ TinyChatEngine supports not only LLM but also VLM. We introduce a sophisticated
 - (For MacOS) Start the chatbot locally. Please use an appropriate terminal (e.g., iTerm2).
   - Image/Text to text
     ```bash
-    ./scripts/vila.sh ../assets/figures/vlm_demo/pedestrian.png
+    ./vila ../assets/figures/vlm_demo/pedestrian.png
     ```
 
   - Image/Speech to speech
     ```bash
-    ./scripts/voice_vila.sh ../assets/figures/vlm_demo/pedestrian.png
+    ./voice_vila ../assets/figures/vlm_demo/pedestrian.png
     ```
 
 - There are several images under the path `../assets/figures/vlm_demo`. Feel free to try different images with VILA on your device!
````
kernels/matmul.h: 5 changes (4 additions, 1 deletion)
```diff
@@ -99,15 +99,16 @@ struct thread_args {
     int start_i, end_i, blk_size;
 };
 
-
 #define MAX(A, B) ((A) > (B) ? (A) : (B))
 #define MIN(A, B) ((A) < (B) ? (A) : (B))
 
 namespace matmul {
 class MatmulOperator {
    public:
     void mat_mul_transposed(const struct matmul_params *params);
     void mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params);
+    void mat_mul_accelerator_transposed_fastover_column_bias(const struct matmul_params *params);
+    void mat_mul_accelerator_untransposed_fastover_column(const struct matmul_params *params);
     // int8
     void naive_mat_mul_int8(const struct matmul_params *params);
     void mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params *params);
@@ -125,6 +126,8 @@
     void mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_params *params);
     void gemv_accelerator_int8_int4_fast_no_offset(struct matmul_params *params);
     void gemm_accelerator_int8_int4_fast_no_offset(struct matmul_params *params);
+    void gemm_accelerator_int8_int4_fast_no_offset_v2(struct matmul_params *params);
+    void cblas_gemm_accelerator_no_offset(struct matmul_params *params);
     void naive_mat_mul_int4(const struct matmul_params *params);
     void naive_mat_mul_int4_with_offset(const struct matmul_params *params);
     // cuda
```
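For orientation, here is a hypothetical usage sketch of the new `mat_mul_accelerator_untransposed_fastover_column` entry point declared above. It relies only on the `matmul_params`/`matrix` fields visible in this commit's diffs (`data_ptr`, `row`, `column`); the include path and zero-initialization of the structs' remaining fields are assumptions, not something the commit specifies.

```cpp
// Hypothetical sketch: calling the new untransposed fp32 kernel.
// Only the fields visible in this diff (data_ptr, row, column) are used;
// the real structs may have more members.
#include <vector>
#include "matmul.h"  // assumed include path

int main() {
    const int m = 4, k = 8, n = 16;
    std::vector<float> A(m * k, 1.0f), B(k * n, 1.0f), C(m * n, 0.0f);

    struct matmul_params params = {};
    params.A.row = m; params.A.column = k; params.A.data_ptr = A.data();
    params.B.row = k; params.B.column = n; params.B.data_ptr = B.data();  // B stored untransposed (k x n)
    params.C.row = m; params.C.column = n; params.C.data_ptr = C.data();

    matmul::MatmulOperator op;
    op.mat_mul_accelerator_untransposed_fastover_column(&params);  // C = A * B
    // With all-ones inputs, every element of C should equal k (= 8.0f).
    return 0;
}
```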
kernels/neon/matmul_neon_fp32.cc: 39 changes (30 additions, 9 deletions)
```diff
@@ -38,25 +38,46 @@ void fp32_ref_matmul(const struct matmul_params *params) {
     }
 }
 
-void fp32_matmul_cblas_gemm(const struct matmul_params *params) {
+inline void fp32_matmul_transposed_cblas_gemm(const struct matmul_params *params) {
     const struct matrix *A = &params->A, *B = &params->B, *C = &params->C;
     float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr;
+    float alpha = params->alpha;
 
+    assert(A->column == B->column);
+    assert(C->row == A->row);
+    assert(C->column == B->row);
+    int m = C->row, n = C->column, k = A->column;
+
+    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+                m, n, k,
+                alpha, data_A, k,
+                data_B, k,
+                0.0f, data_C, n);
+}
+
+void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) {
+    // fp32_ref_matmul(params);
+    fp32_matmul_transposed_cblas_gemm(params);
+}
+
+inline void fp32_matmul_untransposed_cblas_gemm(const struct matmul_params *params) {
+    const struct matrix *A = &params->A, *B = &params->B, *C = &params->C;
+    float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr;
+
     assert(A->column == B->row);
     assert(C->row == A->row);
     assert(C->column == B->column);
-    int m = A->row, n = B->column, k = A->column;
+    int m = C->row, n = C->column, k = A->column;
 
     cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
-                m, n, k,
-                1.0f, data_A, m,
-                data_B, k,
-                0.0f, data_C, m);
+                m, n, k,
+                1.0f, data_A, k,
+                data_B, n,
+                0.0f, data_C, n);
 }
 
-void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) {
-    fp32_ref_matmul(params);
-    // fp32_matmul_cblas_gemm(params);
+void MatmulOperator::mat_mul_accelerator_untransposed_fastover_column(const struct matmul_params *params) {
+    fp32_matmul_untransposed_cblas_gemm(params);
 }
 
 void fp32_ref_matmul_bias(const struct matmul_params *params) {
```
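The transposed path above maps TinyChatEngine's weight layout, where B is stored row-major with shape n x k (already transposed), onto `cblas_sgemm` with `CblasTrans` and `lda = ldb = k`, `ldc = n`. Below is a minimal standalone check of that mapping against a naive triple loop; it is illustrative only, assuming any CBLAS implementation (e.g., Accelerate or OpenBLAS) and arbitrary test sizes.

```cpp
// Standalone sanity check of the row-major C = A * B^T mapping used in
// fp32_matmul_transposed_cblas_gemm. Not part of the commit.
#include <cblas.h>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int m = 3, n = 4, k = 5;  // arbitrary test sizes
    std::vector<float> A(m * k), B(n * k), C(m * n), ref(m * n, 0.0f);
    for (int i = 0; i < m * k; ++i) A[i] = 0.01f * (i + 1);
    for (int i = 0; i < n * k; ++i) B[i] = 0.02f * (i + 1);

    // Naive reference: B is stored transposed (n x k), so
    // C[i][j] = sum_p A[i][p] * B[j][p].
    for (int i = 0; i < m; ++i)
        for (int j = 0; j < n; ++j)
            for (int p = 0; p < k; ++p)
                ref[i * n + j] += A[i * k + p] * B[j * k + p];

    // The mapping from the diff: row-major, B transposed, lda = ldb = k, ldc = n.
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k,
                1.0f, A.data(), k, B.data(), k, 0.0f, C.data(), n);

    float max_err = 0.0f;
    for (int i = 0; i < m * n; ++i) max_err = std::fmax(max_err, std::fabs(C[i] - ref[i]));
    std::printf("max abs diff: %g\n", max_err);  // expect ~0
    return 0;
}
```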
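The untransposed fix, meanwhile, is purely about leading dimensions: the old call passed `lda = m`, `ldb = k`, `ldc = m`, which are column-major strides, while with `CblasRowMajor` each operand's leading dimension is its stored column count (`k` for A, `n` for B, `n` for C), which is what the new call passes. A toy standalone demonstration of that indexing rule (not from the commit):

```cpp
// Why row-major leading dimensions are stored column counts: element (i, j)
// of an R x C row-major matrix lives at data[i * C + j], so the row stride
// ("leading dimension") is C. Standalone demo, not part of the commit.
#include <cstdio>

int main() {
    const int R = 2, C = 3;
    float M[R * C] = {1, 2, 3,
                      4, 5, 6};
    // Correct row stride: C. M(1, 2) is the last element, 6.
    std::printf("M(1,2) with ld=C: %g\n", M[1 * C + 2]);  // prints 6
    // Wrong stride (R, as in the old call): lands on the wrong element.
    std::printf("M(1,2) with ld=R: %g\n", M[1 * R + 2]);  // prints 5
    return 0;
}
```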
