Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Benchmark-VHP #207

Merged
merged 12 commits into from
Oct 25, 2024
169 changes: 85 additions & 84 deletions benchmark/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,13 @@

# which models should be benchmarked?
OPENAI_MODEL_NAMES = [
# "gpt-3.5-turbo-0125",
# "gpt-4-0613",
# "gpt-4-0125-preview",
# "gpt-4-turbo-2024-04-09",
# "gpt-4o-2024-05-13",
# "gpt-4o-mini-2024-07-18",
"gpt-3.5-turbo-0125",
"gpt-4-0613",
"gpt-4-0125-preview",
"gpt-4-turbo-2024-04-09",
"gpt-4o-2024-05-13",
"gpt-4o-2024-08-06",
"gpt-4o-mini-2024-07-18",
]

ANTHROPIC_MODEL_NAMES = [
Expand All @@ -40,8 +41,8 @@
XINFERENCE_MODELS = {
# "code-llama-instruct": {
# "model_size_in_billions": [
# 7,
# 13,
# # 7,
# # 13,
# 34,
# ],
# "model_format": "ggufv2",
Expand Down Expand Up @@ -127,28 +128,28 @@
# # "FP16",
# ],
# },
# "llama-2-chat": {
# "model_size_in_billions": [
# 7,
# 13,
# # 70,
# ],
# "model_format": "ggufv2",
# "quantization": [
# "Q2_K",
# # "Q3_K_S",
# "Q3_K_M",
# # "Q3_K_L",
# # "Q4_0",
# # "Q4_K_S",
# "Q4_K_M",
# # "Q5_0",
# # "Q5_K_S",
# "Q5_K_M",
# "Q6_K",
# "Q8_0",
# ],
# },
"llama-2-chat": {
"model_size_in_billions": [
7,
# 13,
# 70,
],
"model_format": "ggufv2",
"quantization": [
"Q2_K",
# "Q3_K_S",
"Q3_K_M",
# "Q3_K_L",
# "Q4_0",
# "Q4_K_S",
"Q4_K_M",
# "Q5_0",
# "Q5_K_S",
"Q5_K_M",
"Q6_K",
"Q8_0",
],
},
# "llama-3-instruct": {
# "model_size_in_billions": [
# 8,
Expand All @@ -168,31 +169,31 @@
# # "Q4_K_M",
# ],
# },
# "llama-3.1-instruct": {
# "model_size_in_billions": [
# 8,
# # 70,
# ],
# "model_format": "ggufv2",
# "quantization": [
# # 8B model quantisations
# # "Q3_K_L",
# "IQ4_XS",
# # "Q4_K_M",
# # "Q5_K_M",
# # "Q6_K",
# # "Q8_0",
# # 70B model quantisations
# # "IQ2_M",
# # "Q2_K",
# # "Q3_K_S",
# # "IQ4_XS",
# # "Q4_K_M", # crazy slow on mbp m3 max
# # "Q5_K_M",
# # "Q6_K",
# # "Q8_0",
# ],
# },
"llama-3.1-instruct": {
"model_size_in_billions": [
8,
# 70,
],
"model_format": "ggufv2",
"quantization": [
# 8B model quantisations
"Q3_K_L",
"IQ4_XS",
"Q4_K_M",
# "Q5_K_M",
# "Q6_K",
"Q8_0",
# 70B model quantisations
# "IQ2_M",
# "Q2_K",
# "Q3_K_S",
# "IQ4_XS",
# "Q4_K_M", # crazy slow on mbp m3 max
# "Q5_K_M",
# "Q6_K",
# "Q8_0",
],
},
# "mistral-instruct-v0.2": {
# "model_size_in_billions": [
# 7,
Expand Down Expand Up @@ -238,26 +239,26 @@
# "none",
# ],
# },
# "openhermes-2.5": {
# "model_size_in_billions": [
# 7,
# ],
# "model_format": "ggufv2",
# "quantization": [
# "Q2_K",
# # "Q3_K_S",
# "Q3_K_M",
# # "Q3_K_L",
# # "Q4_0",
# # "Q4_K_S",
# "Q4_K_M",
# # "Q5_0",
# # "Q5_K_S",
# "Q5_K_M",
# "Q6_K",
# "Q8_0",
# ],
# },
"openhermes-2.5": {
"model_size_in_billions": [
7,
],
"model_format": "ggufv2",
"quantization": [
"Q2_K",
# "Q3_K_S",
"Q3_K_M",
# "Q3_K_L",
# "Q4_0",
# "Q4_K_S",
"Q4_K_M",
# "Q5_0",
# "Q5_K_S",
"Q5_K_M",
"Q6_K",
"Q8_0",
],
},
}

# create concrete benchmark list by concatenating all combinations of model
Expand Down Expand Up @@ -546,38 +547,38 @@ def pytest_generate_tests(metafunc):
Called once for each test case in the benchmark test collection.
If fixture is part of test declaration, the test is parametrized.
"""
# Load the data file
data_file = BENCHMARK_DATASET["benchmark_data.yaml"]
# Load the data
data = BENCHMARK_DATASET

# Parametrize the fixtures with the collected rows
if "test_data_biocypher_query_generation" in metafunc.fixturenames:
metafunc.parametrize(
"test_data_biocypher_query_generation",
data_file["biocypher_query_generation"],
data["biocypher_query_generation"],
)
if "test_data_rag_interpretation" in metafunc.fixturenames:
metafunc.parametrize(
"test_data_rag_interpretation",
data_file["rag_interpretation"],
data["rag_interpretation"],
)
if "test_data_text_extraction" in metafunc.fixturenames:
metafunc.parametrize(
"test_data_text_extraction",
data_file["text_extraction"],
data["text_extraction"],
)
if "test_data_api_calling" in metafunc.fixturenames:
metafunc.parametrize(
"test_data_api_calling",
data_file["api_calling"],
data["api_calling"],
)
if "test_data_medical_exam" in metafunc.fixturenames:
metafunc.parametrize(
"test_data_medical_exam",
data_file["medical_exam"],
data["medical_exam"],
)


@pytest.fixture
def kg_schemas():
data_file = BENCHMARK_DATASET["benchmark_data.yaml"]
return data_file["kg_schemas"]
data = BENCHMARK_DATASET
return data["kg_schemas"]
62 changes: 62 additions & 0 deletions benchmark/data/benchmark_api_calling_data.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Top-level keys: benchmark modules
# Values: list of dictionaries, each containing a test case
#
# Test case keys:
# - input (for creating the test)
# - expected (for asserting outcomes and generating a score)
# - case (for categorizing the test case)
#
# If any input is a dictionary itself, it will be expanded into separate test
# cases, using the top-level key to create a concatenated test case purpose.

api_calling:
- case: oncokb:braf:melanoma
input:
prompt:
exact_spelling: "What is the consequence of the V600E BRAF variant in Melanoma?"
expected:
parts_of_query:
[
"https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?",
"hugoSymbol=BRAF",
"alteration=V600E",
"tumorType=Melanoma",
]
- case: oncokb:tp53:colon_adenocarcinoma
input:
prompt:
exact_spelling: "What is the consequence of the R273C TP53 variant in Colon Adenocarcinoma?"
expected:
parts_of_query:
[
"https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?hugoSymbol=TP53",
"alteration=R273C",
"tumorType=Colon%20Adenocarcinoma",
]
- case: oncokb:braf:histiocytosis
input:
prompt:
exact_spelling: "What is the consequence of the N486_P490del BRAF variant in Histiocytosis?"
descriptive_spelling: "What is the consequence of an N486_P490 deletion in BRAF in Histiocytosis?"
expected:
parts_of_query:
[
"https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?",
"hugoSymbol=BRAF",
"alteration=N486_P490del",
"tumorType=Histiocytosis",
]
- case: oncokb:ros1:lung_adenocarcinoma
input:
prompt:
exact_spelling: "What is the consequence of the functional fusion of CD74 and ROS1 in Lung Adenocarcinoma?"
expected:
parts_of_query:
[
"https://demo.oncokb.org/api/v1/annotate/structuralVariants?",
"hugoSymbolA=CD74",
"hugoSymbolB=ROS1",
"structuralVariantType=FUSION",
"isFunctionalFusion=true",
"tumorType=Lung%20Adenocarcinoma",
]
Loading
Loading