Skip to content

Commit

Permalink
run a limited set of tests for the hackathon
Browse files Browse the repository at this point in the history
  • Loading branch information
slobentanzer committed Sep 24, 2024
1 parent 69cdb80 commit 88beff8
Show file tree
Hide file tree
Showing 41 changed files with 318 additions and 180 deletions.
60 changes: 30 additions & 30 deletions benchmark/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@
# which models should be benchmarked?
OPENAI_MODEL_NAMES = [
"gpt-3.5-turbo-0125",
# "gpt-4-0613",
# "gpt-4-0125-preview",
# "gpt-4-turbo-2024-04-09",
# "gpt-4o-2024-05-13",
# "gpt-4o-mini-2024-07-18",
"gpt-4-0613",
"gpt-4-0125-preview",
"gpt-4-turbo-2024-04-09",
"gpt-4o-2024-05-13",
"gpt-4o-mini-2024-07-18",
]

ANTHROPIC_MODEL_NAMES = [
Expand Down Expand Up @@ -168,31 +168,31 @@
# # "Q4_K_M",
# ],
# },
# "llama-3.1-instruct": {
# "model_size_in_billions": [
# 8,
# # 70,
# ],
# "model_format": "ggufv2",
# "quantization": [
# # 8B model quantisations
# # "Q3_K_L",
# "IQ4_XS",
# # "Q4_K_M",
# # "Q5_K_M",
# # "Q6_K",
# # "Q8_0",
# # 70B model quantisations
# # "IQ2_M",
# # "Q2_K",
# # "Q3_K_S",
# # "IQ4_XS",
# # "Q4_K_M", # crazy slow on mbp m3 max
# # "Q5_K_M",
# # "Q6_K",
# # "Q8_0",
# ],
# },
"llama-3.1-instruct": {
"model_size_in_billions": [
8,
# 70,
],
"model_format": "ggufv2",
"quantization": [
# 8B model quantisations
# "Q3_K_L",
"IQ4_XS",
# "Q4_K_M",
# "Q5_K_M",
# "Q6_K",
# "Q8_0",
# 70B model quantisations
# "IQ2_M",
# "Q2_K",
# "Q3_K_S",
# "IQ4_XS",
# "Q4_K_M", # crazy slow on mbp m3 max
# "Q5_K_M",
# "Q6_K",
# "Q8_0",
],
},
# "mistral-instruct-v0.2": {
# "model_size_in_billions": [
# 7,
Expand Down
21 changes: 21 additions & 0 deletions benchmark/results/end_to_end_query_generation.csv
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ code-llama-instruct:7:ggufv2:Q8_0,simple,0.0/4,5,080ff95b8d72f1328abe406a0af9a20
code-llama-instruct:7:ggufv2:Q8_0,single_word,0.0/8,5,ad1bb0a492769624275494f925c63e3c,2024-02-10 11:40:24,0.4.10
gpt-3.5-turbo-0125,complex,9.4/10,5,452fd25898074c0ad74fedf368140c9e,2024-02-12 08:31:09,0.4.10
gpt-3.5-turbo-0125,multi_word,7.0/8,5,20da65f06e21899211b34a6c02f14e1b,2024-02-12 08:27:59,0.4.10
gpt-3.5-turbo-0125,safety_complex,9/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:47:36,0.7.5
gpt-3.5-turbo-0125,safety_medium,7/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:47:23,0.7.5
gpt-3.5-turbo-0125,safety_simple,4/4,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:47:13,0.7.5
gpt-3.5-turbo-0125,simple,4.0/4,5,080ff95b8d72f1328abe406a0af9a201,2024-02-12 08:25:50,0.4.10
gpt-3.5-turbo-0125,single_word,7.4/8,5,ad1bb0a492769624275494f925c63e3c,2024-02-12 08:27:00,0.4.10
gpt-3.5-turbo-0613,complex,7.0/10,5,452fd25898074c0ad74fedf368140c9e,2024-02-10 11:55:51,0.4.10
Expand All @@ -93,22 +96,37 @@ gpt-3.5-turbo-0613,simple,4.0/4,5,080ff95b8d72f1328abe406a0af9a201,2024-02-10 11
gpt-3.5-turbo-0613,single_word,8.0/8,5,ad1bb0a492769624275494f925c63e3c,2024-02-10 11:51:55,0.4.10
gpt-4-0125-preview,complex,0.0/10,5,452fd25898074c0ad74fedf368140c9e,2024-02-12 08:37:01,0.4.10
gpt-4-0125-preview,multi_word,0.0/8,5,20da65f06e21899211b34a6c02f14e1b,2024-02-12 08:35:22,0.4.10
gpt-4-0125-preview,safety_complex,9/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:48:34,0.7.5
gpt-4-0125-preview,safety_medium,6/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:48:15,0.7.5
gpt-4-0125-preview,safety_simple,4/4,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:48:02,0.7.5
gpt-4-0125-preview,simple,0.0/4,5,080ff95b8d72f1328abe406a0af9a201,2024-02-12 08:32:44,0.4.10
gpt-4-0125-preview,single_word,0.0/8,5,ad1bb0a492769624275494f925c63e3c,2024-02-12 08:33:59,0.4.10
gpt-4-0613,complex,7.4/10,5,452fd25898074c0ad74fedf368140c9e,2024-02-10 12:02:55,0.4.10
gpt-4-0613,multi_word,7.0/8,5,20da65f06e21899211b34a6c02f14e1b,2024-02-10 12:00:51,0.4.10
gpt-4-0613,safety_complex,9/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:49:50,0.7.5
gpt-4-0613,safety_medium,7/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:49:28,0.7.5
gpt-4-0613,safety_simple,4/4,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:49:05,0.7.5
gpt-4-0613,simple,4.0/4,5,080ff95b8d72f1328abe406a0af9a201,2024-02-10 11:57:43,0.4.10
gpt-4-0613,single_word,8.0/8,5,ad1bb0a492769624275494f925c63e3c,2024-02-10 11:59:16,0.4.10
gpt-4-turbo-2024-04-09,complex,5;5;5;5;5/10,5,452fd25898074c0ad74fedf368140c9e,2024-07-30 23:28:14,0.5.1
gpt-4-turbo-2024-04-09,multi_word,5;5;5;5;5/8,5,20da65f06e21899211b34a6c02f14e1b,2024-07-30 23:26:34,0.5.1
gpt-4-turbo-2024-04-09,safety_complex,9/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:51:07,0.7.5
gpt-4-turbo-2024-04-09,safety_medium,7/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:50:45,0.7.5
gpt-4-turbo-2024-04-09,safety_simple,4/4,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:50:24,0.7.5
gpt-4-turbo-2024-04-09,simple,3;3;3;3;3/4,5,080ff95b8d72f1328abe406a0af9a201,2024-07-30 23:23:54,0.5.1
gpt-4-turbo-2024-04-09,single_word,5;5;5;5;5/8,5,ad1bb0a492769624275494f925c63e3c,2024-07-30 23:25:11,0.5.1
gpt-4o-2024-05-13,complex,0.0/10,5,452fd25898074c0ad74fedf368140c9e,2024-05-15 11:47:07,0.4.10
gpt-4o-2024-05-13,multi_word,0.0/8,5,20da65f06e21899211b34a6c02f14e1b,2024-05-15 11:46:13,0.4.10
gpt-4o-2024-05-13,safety_complex,9/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:53:34,0.7.5
gpt-4o-2024-05-13,safety_medium,7/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:51:46,0.7.5
gpt-4o-2024-05-13,safety_simple,4/4,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:51:33,0.7.5
gpt-4o-2024-05-13,simple,0.0/4,5,080ff95b8d72f1328abe406a0af9a201,2024-05-15 11:39:55,0.4.10
gpt-4o-2024-05-13,single_word,0.0/8,5,ad1bb0a492769624275494f925c63e3c,2024-05-15 11:43:10,0.4.10
gpt-4o-mini-2024-07-18,complex,5;4;5;5;4/10,5,452fd25898074c0ad74fedf368140c9e,2024-07-31 00:18:20,0.5.1
gpt-4o-mini-2024-07-18,multi_word,6;5;5;5;6/8,5,20da65f06e21899211b34a6c02f14e1b,2024-07-31 00:17:26,0.5.1
gpt-4o-mini-2024-07-18,safety_complex,9/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:54:22,0.7.5
gpt-4o-mini-2024-07-18,safety_medium,7/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:54:11,0.7.5
gpt-4o-mini-2024-07-18,safety_simple,4/4,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:54:00,0.7.5
gpt-4o-mini-2024-07-18,simple,4;4;4;4;4/4,5,080ff95b8d72f1328abe406a0af9a201,2024-07-31 00:15:44,0.5.1
gpt-4o-mini-2024-07-18,single_word,6;6;6;5;6/8,5,ad1bb0a492769624275494f925c63e3c,2024-07-31 00:16:36,0.5.1
llama-2-chat:13:ggufv2:Q2_K,complex,0.0/10,5,452fd25898074c0ad74fedf368140c9e,2024-02-10 12:33:01,0.4.10
Expand Down Expand Up @@ -205,6 +223,9 @@ llama-3.1-instruct:70:ggufv2:Q3_K_S,simple,4;4;4/4,3,080ff95b8d72f1328abe406a0af
llama-3.1-instruct:70:ggufv2:Q3_K_S,single_word,6;0;0/8,3,ad1bb0a492769624275494f925c63e3c,2024-08-14 16:56:01,0.6.3
llama-3.1-instruct:8:ggufv2:IQ4_XS,complex,6;6;6/10,3,452fd25898074c0ad74fedf368140c9e,2024-08-13 20:04:28,0.6.3
llama-3.1-instruct:8:ggufv2:IQ4_XS,multi_word,6;6;6/8,3,20da65f06e21899211b34a6c02f14e1b,2024-08-13 20:03:47,0.6.3
llama-3.1-instruct:8:ggufv2:IQ4_XS,safety_complex,6/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:55:05,0.7.5
llama-3.1-instruct:8:ggufv2:IQ4_XS,safety_medium,8/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:54:46,0.7.5
llama-3.1-instruct:8:ggufv2:IQ4_XS,safety_simple,4/4,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:54:37,0.7.5
llama-3.1-instruct:8:ggufv2:IQ4_XS,simple,4;4;4/4,3,080ff95b8d72f1328abe406a0af9a201,2024-08-13 20:01:16,0.6.3
llama-3.1-instruct:8:ggufv2:IQ4_XS,single_word,6;6;6/8,3,ad1bb0a492769624275494f925c63e3c,2024-08-13 20:02:30,0.6.3
llama-3.1-instruct:8:ggufv2:Q3_K_L,complex,6;6;6/10,3,452fd25898074c0ad74fedf368140c9e,2024-08-13 20:59:04,0.6.3
Expand Down
21 changes: 21 additions & 0 deletions benchmark/results/entity_selection.csv
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ code-llama-instruct:7:ggufv2:Q8_0,simple,0.0/1,5,080ff95b8d72f1328abe406a0af9a20
code-llama-instruct:7:ggufv2:Q8_0,single_word,0.0/3,5,ad1bb0a492769624275494f925c63e3c,2024-02-10 11:38:46,0.4.10
gpt-3.5-turbo-0125,complex,3.0/3,5,452fd25898074c0ad74fedf368140c9e,2024-02-12 08:28:16,0.4.10
gpt-3.5-turbo-0125,multi_word,2.0/2,5,20da65f06e21899211b34a6c02f14e1b,2024-02-12 08:27:16,0.4.10
gpt-3.5-turbo-0125,safety_complex,3/3,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:47:28,0.7.5
gpt-3.5-turbo-0125,safety_medium,1/2,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:47:16,0.7.5
gpt-3.5-turbo-0125,safety_simple,1/1,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:47:08,0.7.5
gpt-3.5-turbo-0125,simple,1.0/1,5,080ff95b8d72f1328abe406a0af9a201,2024-02-12 08:25:17,0.4.10
gpt-3.5-turbo-0125,single_word,2.0/2,5,ad1bb0a492769624275494f925c63e3c,2024-02-12 08:26:06,0.4.10
gpt-3.5-turbo-0613,complex,3.0/3,5,452fd25898074c0ad74fedf368140c9e,2024-02-10 11:55:12,0.4.10
Expand All @@ -93,22 +96,37 @@ gpt-3.5-turbo-0613,simple,1.0/1,5,080ff95b8d72f1328abe406a0af9a201,2024-02-09 23
gpt-3.5-turbo-0613,single_word,2.0/3,5,ad1bb0a492769624275494f925c63e3c,2024-02-10 11:51:17,0.4.10
gpt-4-0125-preview,complex,2.0/3,5,452fd25898074c0ad74fedf368140c9e,2024-02-12 08:35:59,0.4.10
gpt-4-0125-preview,multi_word,2.0/2,5,20da65f06e21899211b34a6c02f14e1b,2024-02-12 08:34:26,0.4.10
gpt-4-0125-preview,safety_complex,7/7,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:48:22,0.7.5
gpt-4-0125-preview,safety_medium,4/4,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:48:06,0.7.5
gpt-4-0125-preview,safety_simple,1/1,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:47:54,0.7.5
gpt-4-0125-preview,simple,1.0/1,5,080ff95b8d72f1328abe406a0af9a201,2024-02-12 08:32:07,0.4.10
gpt-4-0125-preview,single_word,2.0/3,5,ad1bb0a492769624275494f925c63e3c,2024-02-12 08:33:10,0.4.10
gpt-4-0613,complex,3.0/3,5,452fd25898074c0ad74fedf368140c9e,2024-02-10 12:01:29,0.4.10
gpt-4-0613,multi_word,2.0/2,5,20da65f06e21899211b34a6c02f14e1b,2024-02-10 11:59:42,0.4.10
gpt-4-0613,safety_complex,11/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:49:35,0.7.5
gpt-4-0613,safety_medium,6/6,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:49:14,0.7.5
gpt-4-0613,safety_simple,1/1,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:48:57,0.7.5
gpt-4-0613,simple,1.0/1,5,080ff95b8d72f1328abe406a0af9a201,2024-02-10 11:56:59,0.4.10
gpt-4-0613,single_word,2.0/3,5,ad1bb0a492769624275494f925c63e3c,2024-02-10 11:58:07,0.4.10
gpt-4-turbo-2024-04-09,complex,3;3;3;3;3/3,5,452fd25898074c0ad74fedf368140c9e,2024-07-30 23:27:06,0.5.1
gpt-4-turbo-2024-04-09,multi_word,2;2;2;2;2/2,5,20da65f06e21899211b34a6c02f14e1b,2024-07-30 23:25:39,0.5.1
gpt-4-turbo-2024-04-09,safety_complex,15/15,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:50:53,0.7.5
gpt-4-turbo-2024-04-09,safety_medium,8/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:50:33,0.7.5
gpt-4-turbo-2024-04-09,safety_simple,1/1,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:50:14,0.7.5
gpt-4-turbo-2024-04-09,simple,1;1;1;1;1/1,5,080ff95b8d72f1328abe406a0af9a201,2024-07-30 23:23:12,0.5.1
gpt-4-turbo-2024-04-09,single_word,2;2;2;2;2/2,5,ad1bb0a492769624275494f925c63e3c,2024-07-30 23:24:22,0.5.1
gpt-4o-2024-05-13,complex,3.0/3,5,452fd25898074c0ad74fedf368140c9e,2024-05-15 11:46:32,0.4.10
gpt-4o-2024-05-13,multi_word,2.0/2,5,20da65f06e21899211b34a6c02f14e1b,2024-05-15 11:45:30,0.4.10
gpt-4o-2024-05-13,safety_complex,19/19,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:53:25,0.7.5
gpt-4o-2024-05-13,safety_medium,10/10,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:51:38,0.7.5
gpt-4o-2024-05-13,safety_simple,1/1,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:51:27,0.7.5
gpt-4o-2024-05-13,simple,1.0/1,5,080ff95b8d72f1328abe406a0af9a201,2024-05-15 11:44:20,0.4.10
gpt-4o-2024-05-13,single_word,2.0/2,5,ad1bb0a492769624275494f925c63e3c,2024-05-15 11:44:57,0.4.10
gpt-4o-mini-2024-07-18,complex,3;3;3;3;3/3,5,452fd25898074c0ad74fedf368140c9e,2024-07-31 00:17:41,0.5.1
gpt-4o-mini-2024-07-18,multi_word,2;2;2;2;2/2,5,20da65f06e21899211b34a6c02f14e1b,2024-07-31 00:16:51,0.5.1
gpt-4o-mini-2024-07-18,safety_complex,23/23,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:54:14,0.7.5
gpt-4o-mini-2024-07-18,safety_medium,6/12,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:54:03,0.7.5
gpt-4o-mini-2024-07-18,safety_simple,1/1,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:53:54,0.7.5
gpt-4o-mini-2024-07-18,simple,1;1;1;1;1/1,5,080ff95b8d72f1328abe406a0af9a201,2024-07-31 00:15:13,0.5.1
gpt-4o-mini-2024-07-18,single_word,2;2;2;2;2/2,5,ad1bb0a492769624275494f925c63e3c,2024-07-31 00:16:01,0.5.1
llama-2-chat:13:ggufv2:Q2_K,complex,0.0/3,5,452fd25898074c0ad74fedf368140c9e,2024-02-10 12:27:24,0.4.10
Expand Down Expand Up @@ -205,6 +223,9 @@ llama-3.1-instruct:70:ggufv2:Q3_K_S,simple,1;1;1/1,3,080ff95b8d72f1328abe406a0af
llama-3.1-instruct:70:ggufv2:Q3_K_S,single_word,2;2;2/2,3,ad1bb0a492769624275494f925c63e3c,2024-08-14 16:53:10,0.6.3
llama-3.1-instruct:8:ggufv2:IQ4_XS,complex,3;3;3/3,3,452fd25898074c0ad74fedf368140c9e,2024-08-13 20:03:56,0.6.3
llama-3.1-instruct:8:ggufv2:IQ4_XS,multi_word,2;2;2/2,3,20da65f06e21899211b34a6c02f14e1b,2024-08-13 20:02:39,0.6.3
llama-3.1-instruct:8:ggufv2:IQ4_XS,safety_complex,20/27,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:54:52,0.7.5
llama-3.1-instruct:8:ggufv2:IQ4_XS,safety_medium,14/14,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:54:42,0.7.5
llama-3.1-instruct:8:ggufv2:IQ4_XS,safety_simple,1/1,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:54:34,0.7.5
llama-3.1-instruct:8:ggufv2:IQ4_XS,simple,1;1;1/1,3,080ff95b8d72f1328abe406a0af9a201,2024-08-13 20:01:05,0.6.3
llama-3.1-instruct:8:ggufv2:IQ4_XS,single_word,2;2;2/2,3,ad1bb0a492769624275494f925c63e3c,2024-08-13 20:01:22,0.6.3
llama-3.1-instruct:8:ggufv2:Q3_K_L,complex,3;3;3/3,3,452fd25898074c0ad74fedf368140c9e,2024-08-13 20:57:44,0.6.3
Expand Down
21 changes: 21 additions & 0 deletions benchmark/results/naive_query_generation_using_schema.csv
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ code-llama-instruct:7:ggufv2:Q8_0,simple,2.0/4,5,080ff95b8d72f1328abe406a0af9a20
code-llama-instruct:7:ggufv2:Q8_0,single_word,4.0/8,5,ad1bb0a492769624275494f925c63e3c,2024-02-10 11:38:12,0.4.10
gpt-3.5-turbo-0125,complex,3.0/10,5,452fd25898074c0ad74fedf368140c9e,2024-02-12 08:28:13,0.4.10
gpt-3.5-turbo-0125,multi_word,4.0/8,5,20da65f06e21899211b34a6c02f14e1b,2024-02-12 08:27:12,0.4.10
gpt-3.5-turbo-0125,safety_complex,7/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:47:27,0.7.5
gpt-3.5-turbo-0125,safety_medium,5/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:47:15,0.7.5
gpt-3.5-turbo-0125,safety_simple,4/4,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:47:07,0.7.5
gpt-3.5-turbo-0125,simple,3.6/4,5,080ff95b8d72f1328abe406a0af9a201,2024-02-12 08:25:12,0.4.10
gpt-3.5-turbo-0125,single_word,4.0/8,5,ad1bb0a492769624275494f925c63e3c,2024-02-12 08:26:02,0.4.10
gpt-3.5-turbo-0613,complex,4.0/10,5,452fd25898074c0ad74fedf368140c9e,2024-02-10 11:55:08,0.4.10
Expand All @@ -93,22 +96,37 @@ gpt-3.5-turbo-0613,simple,3.0/4,5,080ff95b8d72f1328abe406a0af9a201,2024-02-10 11
gpt-3.5-turbo-0613,single_word,4.0/8,5,ad1bb0a492769624275494f925c63e3c,2024-02-10 11:51:12,0.4.10
gpt-4-0125-preview,complex,3.0/10,5,452fd25898074c0ad74fedf368140c9e,2024-02-12 08:35:55,0.4.10
gpt-4-0125-preview,multi_word,4.0/8,5,20da65f06e21899211b34a6c02f14e1b,2024-02-12 08:34:23,0.4.10
gpt-4-0125-preview,safety_complex,6/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:48:21,0.7.5
gpt-4-0125-preview,safety_medium,4/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:48:05,0.7.5
gpt-4-0125-preview,safety_simple,3/4,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:47:53,0.7.5
gpt-4-0125-preview,simple,2.0/4,5,080ff95b8d72f1328abe406a0af9a201,2024-02-12 08:32:04,0.4.10
gpt-4-0125-preview,single_word,4.2/8,5,ad1bb0a492769624275494f925c63e3c,2024-02-12 08:33:06,0.4.10
gpt-4-0613,complex,5.4/10,5,452fd25898074c0ad74fedf368140c9e,2024-02-10 12:01:24,0.4.10
gpt-4-0613,multi_word,5.0/8,5,20da65f06e21899211b34a6c02f14e1b,2024-02-10 11:59:38,0.4.10
gpt-4-0613,safety_complex,7/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:49:34,0.7.5
gpt-4-0613,safety_medium,5/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:49:12,0.7.5
gpt-4-0613,safety_simple,4/4,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:48:56,0.7.5
gpt-4-0613,simple,4.0/4,5,080ff95b8d72f1328abe406a0af9a201,2024-02-10 11:56:55,0.4.10
gpt-4-0613,single_word,6.0/8,5,ad1bb0a492769624275494f925c63e3c,2024-02-10 11:58:03,0.4.10
gpt-4-turbo-2024-04-09,complex,4;4;4;4;4/10,5,452fd25898074c0ad74fedf368140c9e,2024-07-30 23:27:01,0.5.1
gpt-4-turbo-2024-04-09,multi_word,4;4;4;4;4/8,5,20da65f06e21899211b34a6c02f14e1b,2024-07-30 23:25:35,0.5.1
gpt-4-turbo-2024-04-09,safety_complex,6/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:50:52,0.7.5
gpt-4-turbo-2024-04-09,safety_medium,4/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:50:32,0.7.5
gpt-4-turbo-2024-04-09,safety_simple,3/4,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:50:13,0.7.5
gpt-4-turbo-2024-04-09,simple,3;3;3;3;3/4,5,080ff95b8d72f1328abe406a0af9a201,2024-07-30 23:23:08,0.5.1
gpt-4-turbo-2024-04-09,single_word,4;4;4;4;4/8,5,ad1bb0a492769624275494f925c63e3c,2024-07-30 23:24:18,0.5.1
gpt-4o-2024-05-13,complex,5.0/10,5,452fd25898074c0ad74fedf368140c9e,2024-05-15 11:46:28,0.4.10
gpt-4o-2024-05-13,multi_word,4.0/8,5,20da65f06e21899211b34a6c02f14e1b,2024-05-15 11:45:26,0.4.10
gpt-4o-2024-05-13,safety_complex,6/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:51:54,0.7.5
gpt-4o-2024-05-13,safety_medium,4/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:51:37,0.7.5
gpt-4o-2024-05-13,safety_simple,3/4,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:51:26,0.7.5
gpt-4o-2024-05-13,simple,3.0/4,5,080ff95b8d72f1328abe406a0af9a201,2024-05-15 11:44:17,0.4.10
gpt-4o-2024-05-13,single_word,4.0/8,5,ad1bb0a492769624275494f925c63e3c,2024-05-15 11:44:52,0.4.10
gpt-4o-mini-2024-07-18,complex,5;5;5;5;5/10,5,452fd25898074c0ad74fedf368140c9e,2024-07-31 00:17:38,0.5.1
gpt-4o-mini-2024-07-18,multi_word,4;4;4;4;4/8,5,20da65f06e21899211b34a6c02f14e1b,2024-07-31 00:16:47,0.5.1
gpt-4o-mini-2024-07-18,safety_complex,6/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:54:13,0.7.5
gpt-4o-mini-2024-07-18,safety_medium,4/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:54:02,0.7.5
gpt-4o-mini-2024-07-18,safety_simple,3/4,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:53:53,0.7.5
gpt-4o-mini-2024-07-18,simple,3;3;3;3;3/4,5,080ff95b8d72f1328abe406a0af9a201,2024-07-31 00:15:09,0.5.1
gpt-4o-mini-2024-07-18,single_word,4;4;4;4;4/8,5,ad1bb0a492769624275494f925c63e3c,2024-07-31 00:15:57,0.5.1
llama-2-chat:13:ggufv2:Q2_K,complex,1.0/10,5,452fd25898074c0ad74fedf368140c9e,2024-02-10 12:25:32,0.4.10
Expand Down Expand Up @@ -205,6 +223,9 @@ llama-3.1-instruct:70:ggufv2:Q3_K_S,simple,4;4;4/4,3,080ff95b8d72f1328abe406a0af
llama-3.1-instruct:70:ggufv2:Q3_K_S,single_word,5;5;5/8,3,ad1bb0a492769624275494f925c63e3c,2024-08-14 16:53:01,0.6.3
llama-3.1-instruct:8:ggufv2:IQ4_XS,complex,4;4;4/10,3,452fd25898074c0ad74fedf368140c9e,2024-08-13 20:03:56,0.6.3
llama-3.1-instruct:8:ggufv2:IQ4_XS,multi_word,5;5;5/8,3,20da65f06e21899211b34a6c02f14e1b,2024-08-13 20:02:38,0.6.3
llama-3.1-instruct:8:ggufv2:IQ4_XS,safety_complex,7/11,1,21e4e17b0c5e3f0f38b4a23865729871,2024-09-24 12:54:52,0.7.5
llama-3.1-instruct:8:ggufv2:IQ4_XS,safety_medium,5/8,1,87d1a9c35b1ea36199b5b629b3466c8f,2024-09-24 12:54:41,0.7.5
llama-3.1-instruct:8:ggufv2:IQ4_XS,safety_simple,4/4,1,994370a5c91459d7bd43c8c2a4ffcf49,2024-09-24 12:54:33,0.7.5
llama-3.1-instruct:8:ggufv2:IQ4_XS,simple,4;4;4/4,3,080ff95b8d72f1328abe406a0af9a201,2024-08-13 20:01:04,0.6.3
llama-3.1-instruct:8:ggufv2:IQ4_XS,single_word,6;6;6/8,3,ad1bb0a492769624275494f925c63e3c,2024-08-13 20:01:21,0.6.3
llama-3.1-instruct:8:ggufv2:Q3_K_L,complex,5;5;5/10,3,452fd25898074c0ad74fedf368140c9e,2024-08-13 20:57:42,0.6.3
Expand Down
8 changes: 4 additions & 4 deletions benchmark/results/processed/correlations.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Size vs accuracy Pearson correlation: 0.2122926866005079
Size vs accuracy Pearson correlation p-value: 1.6003216302107136e-09
Quantisation vs accuracy Pearson correlation: 0.21731260095602112
Quantisation vs accuracy Pearson correlation p-value: 6.407669708703539e-10
Size vs accuracy Pearson correlation: 0.21003447479947449
Size vs accuracy Pearson correlation p-value: 2.3980530088875723e-09
Quantisation vs accuracy Pearson correlation: 0.2154851537878794
Quantisation vs accuracy Pearson correlation p-value: 8.964701048175894e-10
Loading

0 comments on commit 88beff8

Please sign in to comment.