[PDSM] Medical Testcases for benchmarking #157

Merged (98 commits, Jul 18, 2024)
Changes from 52 commits

Commits (98):
5a8749e - change iterations (Apr 2, 2024)
4ecca83 - I love python (SturmCamper, Apr 5, 2024)
b3b18a9 - Local-Test-Remove in conftest.py (SturmCamper, Apr 5, 2024)
d79cb67 - Working project version (marlis-en, Apr 21, 2024)
813e2c1 - First pytest not working (marlis-en, Apr 21, 2024)
ddd8bf7 - 1 test works (SeraLunatic, Apr 21, 2024)
41ddf2a - test name (SeraLunatic, Apr 21, 2024)
29a62b8 - second test (SeraLunatic, Apr 21, 2024)
3a38516 - added Physikum questions (SeraLunatic, Apr 23, 2024)
fb7c431 - regex search (SeraLunatic, Apr 23, 2024)
0f772de - lowercase back in, oops (SeraLunatic, Apr 23, 2024)
fb7905d - ADD Emergency Testcase (Apr 27, 2024)
827aad8 - CHANGE: regex and not regex in one runtest and csv (SeraLunatic, Apr 29, 2024)
7b1d785 - REFACTOR test names changed (SeraLunatic, Apr 29, 2024)
3356c9e - ADDED new regex EEG-Questions (SeraLunatic, Apr 30, 2024)
392da50 - ADDED new yes_no EEG-Questions (SeraLunatic, Apr 30, 2024)
a0ca30a - Merge pull request #1 from mehizli/datascience_fragen (ytehran, May 14, 2024)
9b4cf61 - Merge pull request #2 from mehizli/testcase_emergency (ytehran, May 14, 2024)
a3788b2 - test (May 14, 2024)
8dcc567 - Merge branch 'develop' into meli-test-file (May 14, 2024)
8b1321d - indent fixes (May 14, 2024)
84abb7d - Merge pull request #3 from mehizli/meli-test-file (ytehran, May 14, 2024)
4caf83a - FIXED Encoding Bug (SeraLunatic, May 15, 2024)
886c336 - Merge pull request #4 from mehizli/bugfix/data_not_found (SeraLunatic, May 15, 2024)
1768961 - Added new testcases for mental diseases (marlis-en, May 15, 2024)
372e3c8 - test cases fixes (May 15, 2024)
d731dba - Merge branch 'develop' into meli-test-file (May 15, 2024)
e5be05f - test cases indent fixes (May 15, 2024)
20183cb - added English translations (May 20, 2024)
b177a7d - test for oncology (May 20, 2024)
3dd8930 - ADD translation for Physikum (SeraLunatic, May 22, 2024)
5e2d180 - ADD translation EEG (SeraLunatic, May 22, 2024)
a53a8be - REFACTORED and ADDED questions for mental diseases (marlis-en, May 22, 2024)
0a4b75a - WIP wrong_answer csv (SeraLunatic, May 22, 2024)
d3c6fce - BUGFIX eeg_answer (SeraLunatic, May 22, 2024)
c853b72 - ADD writing csv with expected, wrong and failure_group (SeraLunatic, May 22, 2024)
f3a1597 - ADD regex failure_groups (SeraLunatic, May 22, 2024)
4d67b3c - ADD regex failure_groups and better synonym tracker (SeraLunatic, May 22, 2024)
64e27d4 - Merge pull request #5 from mehizli/meli-test-file (ytehran, May 22, 2024)
7ccd4f4 - Merge pull request #6 from mehizli/translate_quest (ytehran, May 22, 2024)
83b4905 - Merge pull request #7 from mehizli/testcase_mental_diseases (ytehran, May 22, 2024)
ad7a0da - Merge pull request #8 from mehizli/develop (ytehran, May 22, 2024)
b48d03f - Merge branch 'wrong_answers' into develop (SeraLunatic, May 22, 2024)
861282a - Merge pull request #9 from mehizli/develop (ytehran, May 22, 2024)
2b4f6ad - Translate the emergency cases and update to the new schema (May 22, 2024)
9f4e039 - New cardiology cases in German (May 24, 2024)
f871db0 - Case translated into English (May 24, 2024)
74b821c - Merge pull request #10 from mehizli/meli-test-file (ytehran, May 24, 2024)
eb73efb - Merge branch 'develop' into cardio_update (May 24, 2024)
4a4fd38 - Refactor and clean code (May 24, 2024)
4b53bb0 - Merge pull request #11 from mehizli/cardio_update (ytehran, May 24, 2024)
d4c433d - Merge pull request #12 from mehizli/develop (ytehran, May 24, 2024)
72c7429 - ADDED new testcase dermatology (marlis-en, May 27, 2024)
be9cdc9 - formatting (slobentanzer, Jun 4, 2024)
14490f1 - revert doubly run tests (slobentanzer, Jun 4, 2024)
5c10b8b - revert non-skipping (slobentanzer, Jun 4, 2024)
2714b9a - comment out models for test purposes (slobentanzer, Jun 10, 2024)
49e4e41 - Merge branch 'main' into pr/ytehran/157 (slobentanzer, Jun 10, 2024)
018191e - pre-commit (slobentanzer, Jun 10, 2024)
5328bd9 - correct function name (slobentanzer, Jun 10, 2024)
af82910 - Merge branch 'main' into pr/ytehran/157 (slobentanzer, Jun 10, 2024)
c137e4c - change nomenclature: `wrong_result` -> `failure_mode` (slobentanzer, Jun 10, 2024)
b1125cb - record single scores, calculate standard deviation (slobentanzer, Jun 10, 2024)
fffadab - delete venv/.env in gitignore (Jun 11, 2024)
8477af7 - delete test results (slobentanzer, Jun 11, 2024)
4caed92 - Merge branch 'main' of https://github.com/mehizli/biochatter_pdsm int… (slobentanzer, Jun 11, 2024)
5cb9e52 - one iteration for dev (slobentanzer, Jun 11, 2024)
b3e2908 - bring back text extraction results (slobentanzer, Jun 11, 2024)
be8562e - rename `correctness` to `medical_exam`, more specific (slobentanzer, Jun 11, 2024)
d80efe6 - run once on openhermes (slobentanzer, Jun 11, 2024)
b24b8d3 - replace case separator (colon) (slobentanzer, Jun 11, 2024)
bf3808f - reset results (slobentanzer, Jun 11, 2024)
6ce37f0 - run once on representative models (slobentanzer, Jun 11, 2024)
8aeaaae - REFACTOR testcases mental diseases and dermatology (marlis-en, Jun 14, 2024)
b73944e - little fixes + consistent question format (Jun 15, 2024)
c57d0ee - Refine the categorize_failures method (Jun 16, 2024)
f5984db - Merge remote-tracking branch 'origin/develop' into develop (Jun 16, 2024)
0651824 - Merge branch 'develop' into promt-optimization (Jun 16, 2024)
dedc67f - Refactor to split single and multiple choice (Jun 16, 2024)
6ff88de - Merge pull request #14 from mehizli/promt-optimization (ytehran, Jun 16, 2024)
37d0916 - Merge pull request #15 from mehizli/develop (ytehran, Jun 16, 2024)
66293a9 - first batch of LLMs run on med exam (slobentanzer, Jun 18, 2024)
a1f4807 - another batch of models run (slobentanzer, Jun 19, 2024)
ac5845f - another batch of open models run (slobentanzer, Jun 20, 2024)
112bd8d - add Jupyter notebook for analysis and graphs (SturmCamper, Jun 25, 2024)
e9bcc7b - Remove due to deprecation (just throwing errors while benchmarking) (SturmCamper, Jun 25, 2024)
f8393e8 - Jupyter notebook for simple graph generation (SturmCamper, Jun 25, 2024)
99260b6 - Added documentation of functions (marlis-en, Jun 25, 2024)
770802c - Documentation of the data analysis (marlis-en, Jun 25, 2024)
2cb47c9 - Added Lang/cat graph (SturmCamper, Jun 25, 2024)
70f7cc8 - Merge branch 'stats_graphs' of https://github.com/mehizli/biochatter_… (SturmCamper, Jun 25, 2024)
f40cae7 - Merge branch 'main' into main (slobentanzer, Jul 2, 2024)
ee0482b - Merge branch 'main' into main (slobentanzer, Jul 2, 2024)
c1490eb - Merge pull request #16 from mehizli/develop (ytehran, Jul 15, 2024)
40086de - Changes to fix PR (Jul 15, 2024)
c7a4695 - Fixed the calculation of the std (Jul 15, 2024)
e753aec - Merge branch 'stats_graphs' of github.com:mehizli/biochatter_pdsm int… (Jul 15, 2024)
ad2070d - Merge pull request #18 from mehizli/stats_graphs (ytehran, Jul 16, 2024)
.gitignore (1 change: 1 addition, 0 deletions)

@@ -4,6 +4,7 @@ dist/
__pycache__/
.venv
.pytest_cache
venv/.env
slobentanzer marked this conversation as resolved.
.env
*.mp3
.cache
benchmark/benchmark_utils.py (138 changes: 138 additions, 0 deletions)
@@ -1,6 +1,8 @@
import pytest

import pandas as pd
import re
from nltk.corpus import wordnet
from datetime import datetime


@@ -30,6 +32,9 @@ def benchmark_already_executed(
"""
task_results = return_or_create_result_file(task)

# ensure the failure-group csv exists (create it if missing)
return_or_create_wrong_result_file(task)

if task_results.empty:
return False

@@ -96,6 +101,47 @@
return results


def return_or_create_wrong_result_file(task: str):
Contributor: This is not an intuitive name for the function. I had trouble understanding what it is for just from reading the code (what is a "wrong result file"?). I would suggest naming the entire process something like "failure mode identification" such that it is intuitively clear what is happening. What you are doing is saving responses in case of a failure.

Reply (Contributor): I changed the naming in c137e4c.

"""
Returns the wrong result file for the task or creates it if it does not exist.

Args:
task (str): The benchmark task, e.g. "biocypher_query_generation"

Returns:
pd.DataFrame: The wrong result file for the task
"""
file_path = get_wrong_result_file_path(task)
try:
results = pd.read_csv(file_path, header=0)
except (pd.errors.EmptyDataError, FileNotFoundError):
results = pd.DataFrame(
columns=[
"model_name",
"subtask",
"wrong_answer",
"expected_answer",
"failure_groups",
"md5_hash",
"datetime",
]
)
results.to_csv(file_path, index=False)
return results


def get_wrong_result_file_path(task: str) -> str:
"""Returns the path to the wrong result file.

Args:
task (str): The benchmark task, e.g. "biocypher_query_generation"

Returns:
str: The path to the wrong result file
"""
return f"benchmark/results/{task}_failure_groups.csv"


def write_results_to_file(
model_name: str,
subtask: str,
@@ -126,6 +172,98 @@ def write_results_to_file(
results.to_csv(file_path, index=False)


def write_wrong_results_to_file(
model_name: str,
subtask: str,
wrong_answer: str,
expected_answer: str,
failure_groups: str,
md5_hash: str,
file_path: str,
):
"""Writes the wrong benchmark results for the subtask to the result file.

Args:
model_name (str): The model name, e.g. "gpt-3.5-turbo"
subtask (str): The benchmark subtask test case, e.g. "entities"
wrong_answer (str): The wrong answer given to the subtask
expected_answer (str): The expected answer for the subtask
failure_groups (str): The failure group, e.g. "Wrong count of words"
md5_hash (str): The md5 hash of the test case
file_path (str): The path to the result file
"""
results = pd.read_csv(file_path, header=0)
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
new_row = pd.DataFrame(
[[model_name, subtask, wrong_answer, expected_answer, failure_groups, md5_hash, now]],
columns=results.columns,
)
results = pd.concat([results, new_row], ignore_index=True).sort_values(
by=["model_name", "subtask"]
)
results.to_csv(file_path, index=False)
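
For orientation, a minimal usage sketch tying the helpers above together; the task name, subtask, answers, and hash are hypothetical placeholders, not values from this PR:

# Hypothetical usage sketch; all values below are placeholders.
task = "medical_exam"
return_or_create_wrong_result_file(task)  # make sure the failure-group CSV exists
write_wrong_results_to_file(
    model_name="gpt-3.5-turbo-0125",
    subtask="cardiology",
    wrong_answer="a headache",
    expected_answer="a",
    failure_groups=categorize_failures("a headache", "a"),  # -> "Partial Match"
    md5_hash="0" * 32,  # placeholder hash
    file_path=get_wrong_result_file_path(task),
)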


def categorize_failures(wrong_answer, expected_answer, regex=False):

if not regex:

# Check if the answer is right, but the case sensitivity was wrong (e.g. a / A)
if wrong_answer.lower() == expected_answer.lower():
return "Case Sensitivity"

# Check if some of the answer is right (e.g. "a headache" instead of "a")
elif wrong_answer in expected_answer or expected_answer in wrong_answer:
return "Partial Match"

# Check if the format of the answer is wrong, but the answer otherwise is right (e.g. "a b" instead of "ab")
elif re.sub(r'\s+', '', wrong_answer.lower()) == re.sub(r'\s+', '', expected_answer.lower()):
return "Format Error"

# Check if the answer is a synonym with nltk (e.g. Illness / Sickness)
elif is_synonym(wrong_answer, expected_answer):
return "Synonym"

# Check if the format of the answer is wrong due to numerical or alphabetic differences (e.g. "123" vs "one two three")
elif re.search(r'\w+', wrong_answer) and re.search(r'\w+', expected_answer) and any(char.isdigit() for char in wrong_answer) != any(char.isdigit() for char in expected_answer):
return "Format Error"

# Check for a partial match ignoring case
elif wrong_answer.lower() in expected_answer.lower() or expected_answer.lower() in wrong_answer.lower():
return "Partial Match / Case Sensitivity"

# Else the answer may be completely wrong
else:
return "Other"

else:
# Check if all the words in wrong_answer are expected but some of the expected are missing
if all(word in expected_answer for word in wrong_answer.split()):
return "Words Missing"

# Check if some words in wrong_answer are incorrect (present in wrong_answer but not in expected_answer)
#elif any(word not in expected_answer for word in wrong_answer.split()):
# return "Incorrect Words"

# Check if the entire wrong_answer is completely different from the expected_answer
else:
return "Entire Answer Incorrect"


def is_synonym(word1, word2):
# yes/no style answers (including German "ja"/"nein") are not treated as synonyms
if word2 in ("yes", "no", "ja", "nein"):
return False

synsets1 = wordnet.synsets(word1)
synsets2 = wordnet.synsets(word2)

for synset1 in synsets1:
for synset2 in synsets2:
if synset1.wup_similarity(synset2) is not None:
return True
return False
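
To make the branching in categorize_failures concrete, a few illustrative calls; the answers are hypothetical, and the "Synonym" case assumes the corrected is_synonym above plus a downloaded NLTK WordNet corpus:

# Illustrative calls; expected outputs follow the branching above.
print(categorize_failures("A", "a"))               # Case Sensitivity
print(categorize_failures("a headache", "a"))      # Partial Match
print(categorize_failures("a b", "ab"))            # Format Error
print(categorize_failures("sickness", "illness"))  # Synonym (via WordNet)
print(categorize_failures("xyz", "abc"))           # Other
print(categorize_failures("nausea", "nausea vomiting", regex=True))  # Words Missing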


# TODO should we use SQLite? An online database (REDIS)?
def get_result_file_path(file_name: str) -> str:
"""Returns the path to the result file.
benchmark/conftest.py (25 changes: 18 additions, 7 deletions)
@@ -1,29 +1,28 @@
import os

import requests
from dotenv import load_dotenv
Contributor: Why do we need dotenv? The implementation seems a bit hacky ;)

Contributor (author): Might be a bit of a workaround, but this was the only way we could implement the key loading. We have some questions; more later by mail.

Contributor: @ytehran could you or another team member address this? It prevents me from merging.

from xinference.client import Client
import pytest

import numpy as np
import pandas as pd

from biochatter.prompts import BioCypherPromptEngine
from benchmark.load_dataset import get_benchmark_dataset
from .load_dataset import get_benchmark_dataset
from biochatter.llm_connect import GptConversation, XinferenceConversation
from .benchmark_utils import benchmark_already_executed

# how often should each benchmark be run?
N_ITERATIONS = 5
N_ITERATIONS = 1

# which dataset should be used for benchmarking?
BENCHMARK_DATASET = get_benchmark_dataset()

# which models should be benchmarked?
OPENAI_MODEL_NAMES = [
"gpt-3.5-turbo-0613",
"gpt-3.5-turbo-0125",
"gpt-4-0613",
"gpt-4-0125-preview",
"gpt-3.5-turbo-0125"
#"gpt-4-0613"
]

XINFERENCE_MODELS = {
@@ -148,7 +147,7 @@
for quantization in XINFERENCE_MODELS[model_name]["quantization"]
]

BENCHMARKED_MODELS = OPENAI_MODEL_NAMES + XINFERENCE_MODEL_NAMES
BENCHMARKED_MODELS = OPENAI_MODEL_NAMES #+ XINFERENCE_MODEL_NAMES
BENCHMARKED_MODELS.sort()

# Xinference IP and port
@@ -233,6 +232,9 @@ def conversation(request, model_name):
prompts={},
correct=False,
)
# delete first dots if venv is in project env
Contributor: As mentioned, I would not do this. Rather use .venv and configure your setup to use this env reliably. Environment setup is on the user and should not be mandated by the code.

cus_path = os.getcwd() + "../../venv/bin/.env"
load_dotenv(cus_path)
conversation.set_api_key(
os.getenv("OPENAI_API_KEY"), user="benchmark_user"
)
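
For reference, a sketch of the direction the reviewer suggests: resolve a project-level .env with python-dotenv's find_dotenv instead of hard-coding a venv-relative path. This assumes the .env sits at the repository root and is not part of this PR:

# Sketch only: discover .env by walking up from the current directory.
import os
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())  # silently does nothing if no .env is found
conversation.set_api_key(os.getenv("OPENAI_API_KEY"), user="benchmark_user")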
Expand Down Expand Up @@ -304,6 +306,9 @@ def evaluation_conversation():
prompts={},
correct=False,
)
# delete first dots if venv is in project env
cus_path = os.getcwd() + "../../venv/bin/.env"
Contributor: @ytehran please have someone remove this so we can merge this PR.

load_dotenv(cus_path)
conversation.set_api_key(os.getenv("OPENAI_API_KEY"), user="benchmark_user")
return conversation

@@ -396,6 +401,12 @@ def pytest_generate_tests(metafunc):
"test_data_text_extraction",
data_file["text_extraction"],
)
if "test_data_correctness" in metafunc.fixturenames:
metafunc.parametrize(
"test_data_correctness",
data_file["correctness"],
)
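
For readers unfamiliar with the hook: pytest_generate_tests runs at collection time and parametrizes any fixture named in a test's signature. A minimal, self-contained sketch of the same pattern, with hypothetical data:

# Stand-alone illustration of the parametrization pattern used above (hypothetical data).
CASES = [("What is 2+2?", "4"), ("What is the capital of France?", "Paris")]

def pytest_generate_tests(metafunc):
    if "test_data_correctness" in metafunc.fixturenames:
        metafunc.parametrize("test_data_correctness", CASES)

def test_answer_is_string(test_data_correctness):
    question, expected = test_data_correctness
    assert isinstance(expected, str)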



@pytest.fixture