
Commit

Merge pull request #157 from mehizli/main

Merge medical benchmark into `dev` to integrate with other current developments, to be merged into `main` shortly
slobentanzer authored Jul 18, 2024
2 parents 89ec5af + ad2070d commit 161d9fa
Showing 65 changed files with 47,848 additions and 1,296 deletions.
3,718 changes: 3,718 additions & 0 deletions benchmark/Data_Analysis.ipynb

Large diffs are not rendered by default.

195 changes: 194 additions & 1 deletion benchmark/benchmark_utils.py
@@ -1,5 +1,6 @@
from datetime import datetime

import re
from nltk.corpus import wordnet
import pytest
import importlib_metadata

@@ -32,6 +33,9 @@ def benchmark_already_executed(
"""
task_results = return_or_create_result_file(task)

# check if failure group csv already exists
return_or_create_failure_mode_file(task)

if task_results.empty:
return False

@@ -99,6 +103,50 @@ def return_or_create_result_file(
    return results


def return_or_create_failure_mode_file(task: str):
    """
    Returns the failure mode file for the task or creates it if it does not
    exist.

    Args:
        task (str): The benchmark task, e.g. "biocypher_query_generation"

    Returns:
        pd.DataFrame: The failure mode recording file for the task
    """
    file_path = get_failure_mode_file_path(task)
    try:
        results = pd.read_csv(file_path, header=0)
    except (pd.errors.EmptyDataError, FileNotFoundError):
        results = pd.DataFrame(
            columns=[
                "model_name",
                "subtask",
                "actual_answer",
                "expected_answer",
                "failure_modes",
                "md5_hash",
                "datetime",
            ]
        )
        results.to_csv(file_path, index=False)
    return results


def get_failure_mode_file_path(task: str) -> str:
    """
    Returns the path to the failure mode recording file.

    Args:
        task (str): The benchmark task, e.g. "biocypher_query_generation"

    Returns:
        str: The path to the failure mode file
    """
    return f"benchmark/results/{task}_failure_modes.csv"


def write_results_to_file(
    model_name: str,
    subtask: str,
@@ -130,6 +178,151 @@ def write_results_to_file(
    results.to_csv(file_path, index=False)


def write_failure_modes_to_file(
    model_name: str,
    subtask: str,
    actual_answer: str,
    expected_answer: str,
    failure_modes: str,
    md5_hash: str,
    file_path: str,
):
    """
    Writes the failure modes identified for a given response to a subtask to
    the given file path.

    Args:
        model_name (str): The model name, e.g. "gpt-3.5-turbo"
        subtask (str): The benchmark subtask test case, e.g. "entities"
        actual_answer (str): The actual response given to the subtask question
        expected_answer (str): The expected response for the subtask
        failure_modes (str): The mode of failure, e.g. "Wrong word count",
            "Formatting", etc.
        md5_hash (str): The md5 hash of the test case
        file_path (str): The path to the failure mode recording file
    """
    results = pd.read_csv(file_path, header=0)
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    new_row = pd.DataFrame(
        [
            [
                model_name,
                subtask,
                actual_answer,
                expected_answer,
                failure_modes,
                md5_hash,
                now,
            ]
        ],
        columns=results.columns,
    )
    results = pd.concat([results, new_row], ignore_index=True).sort_values(
        by=["model_name", "subtask"]
    )
    results.to_csv(file_path, index=False)
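
Not part of the diff: a sketch of how a single failure might be recorded with the helpers above; all values are illustrative, and the CSV must already exist (hence the preceding call to return_or_create_failure_mode_file).

# illustrative values only; the md5 hash and task name are invented
return_or_create_failure_mode_file("medical_exam")
write_failure_modes_to_file(
    model_name="gpt-3.5-turbo",
    subtask="entities",
    actual_answer="Patient",
    expected_answer="patient",
    failure_modes="Case Sensitivity",
    md5_hash="0" * 32,
    file_path=get_failure_mode_file_path("medical_exam"),
)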


def categorize_failure_modes(
    actual_answer, expected_answer, regex=False
) -> str:
    """
    Categorises the mode of failure for a given response to a subtask.

    Args:
        actual_answer (str): The actual response given to the subtask question
        expected_answer (str): The expected response for the subtask
        regex (bool): Whether the expected answer is a regex expression

    Returns:
        str: The mode of failure, e.g. "Case Sensitivity", "Partial Match",
        "Format Error", "Synonym", "Words Missing", "Entire Answer Incorrect",
        "Other"
    """
    if not regex:
        # Check if the answer is right, but the case sensitivity was wrong
        # (e.g. a / A)
        if actual_answer.lower() == expected_answer.lower():
            return "Case Sensitivity"

        # Check if the wrong answer contains the expected answer followed by ")"
        elif actual_answer.strip() == expected_answer + ")":
            return "Format Error"

        # Check if some of the answer is partially right, but only if it is
        # more than one letter
        elif len(expected_answer) > 1 and (
            actual_answer in expected_answer
            or expected_answer in actual_answer
        ):
            return "Partial Match"

        # Check if the format of the answer is wrong, but the answer otherwise
        # is right (e.g. "a b" instead of "ab")
        elif re.sub(r"\s+", "", actual_answer.lower()) == re.sub(
            r"\s+", "", expected_answer.lower()
        ):
            return "Format Error"

        # Check if the answer is a synonym with nltk (e.g. Illness / Sickness)
        elif is_synonym(actual_answer, expected_answer):
            return "Synonym"

        # Check if the format of the answer is wrong due to numerical or
        # alphabetic differences (e.g. "123" vs "one two three")
        elif (
            re.search(r"\w+", actual_answer)
            and re.search(r"\w+", expected_answer)
            and any(char.isdigit() for char in actual_answer)
            != any(char.isdigit() for char in expected_answer)
        ):
            return "Format Error"

        # Check if partial match with case sensitivity
        elif (
            actual_answer.lower() in expected_answer.lower()
            or expected_answer.lower() in actual_answer.lower()
        ):
            return "Partial Match / Case Sensitivity"

        # Else the answer may be completely wrong
        else:
            return "Other"

    else:
        # Check if all the words in actual_answer are expected but some of the
        # expected words are missing
        if all(word in expected_answer for word in actual_answer.split()):
            return "Words Missing"

        # Check if some words in actual_answer are incorrect (present in
        # actual_answer but not in expected_answer)
        # elif any(word not in expected_answer for word in actual_answer.split()):
        #     return "Incorrect Words"

        # Check if the entire actual_answer is completely different from the
        # expected_answer
        else:
            return "Entire Answer Incorrect"


def is_synonym(word1, word2):
    """
    Tests whether the input arguments word1 and word2 are synonyms of each
    other. If yes, the function returns True, False otherwise.
    """
    # skip the synonym check for yes/no style answers
    if word2 in ("yes", "no", "ja", "nein"):
        return False

    synsets1 = wordnet.synsets(word1)
    synsets2 = wordnet.synsets(word2)

    for synset1 in synsets1:
        for synset2 in synsets2:
            # any non-None Wu-Palmer similarity is treated as a synonym
            if synset1.wup_similarity(synset2) is not None:
                return True
    return False
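
Not part of the diff: a quick check of is_synonym, assuming the NLTK WordNet corpus has been downloaded (nltk.download("wordnet")).

print(is_synonym("illness", "sickness"))  # True with standard WordNet data
print(is_synonym("illness", "no"))        # False: yes/no answers are skipped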


# TODO should we use SQLite? An online database (REDIS)?
def get_result_file_path(file_name: str) -> str:
"""Returns the path to the result file.
