
Commit

Merge pull request #18 from mehizli/stats_graphs
Stats graphs
ytehran authored Jul 16, 2024
2 parents 40086de + e753aec commit ad2070d
Showing 5 changed files with 3,765 additions and 12 deletions.
3,718 changes: 3,718 additions & 0 deletions benchmark/Data_Analysis.ipynb

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions benchmark/benchmark_utils.py
@@ -306,6 +306,10 @@ def categorize_failure_modes(


def is_synonym(word1, word2):
"""
Tests, if the input arguments word1 and word2 are synonyms of each other.
If yes, the function returns True, False otherwise.
"""
if word2 in ("yes", "no", "ja", "nein"):
return False

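The hunk above shows only the guard clause of is_synonym. For orientation, a minimal sketch of how the check could be completed with NLTK's WordNet (the toolkit downloaded in test_user_interaction.py below); the function body here is an assumption for illustration, not the code from this PR:

from nltk.corpus import wordnet

def is_synonym(word1: str, word2: str) -> bool:
    """Return True if word2 appears among the WordNet lemmas of word1."""
    # Guard clause from the diff: yes/no style answers (including the
    # German "ja"/"nein") are never treated as synonyms.
    if word2 in ("yes", "no", "ja", "nein"):
        return False
    # Collect every lemma of every synset of word1, then test membership.
    synonyms = {
        lemma.lower()
        for synset in wordnet.synsets(word1)
        for lemma in synset.lemma_names()
    }
    return word2.lower() in synonyms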
1 change: 0 additions & 1 deletion benchmark/conftest.py
@@ -20,7 +20,6 @@

# which models should be benchmarked?
OPENAI_MODEL_NAMES = [
# "gpt-3.5-turbo-0613",
"gpt-3.5-turbo-0125",
# "gpt-4-0613",
# "gpt-4-0125-preview",
13 changes: 12 additions & 1 deletion benchmark/test_user_interaction.py
@@ -22,6 +22,15 @@ def test_medical_exam(
conversation,
multiple_testing,
):
"""Test medical exam data by the model.
The user input is a medical question with answer options. The system prompt
has the guidelines to answer the question, and the expected answer is the
information that the model should reply from the given question. If the case
contains the word 'regex', the test is successful if the extracted information
occures in the words in response. If it is a different question, the test is
successful if the extracted information matches the expected answer exactly.
For all false answers also calculate the failure mode of the answer.
"""
# Downloads the Natural Language Toolkit (NLTK) data; only needs to be done once per device
# nltk.download()

@@ -40,13 +49,15 @@ def run_test():
nonlocal expected_answer
nonlocal failure_mode
conversation.reset() # needs to be reset for each test
# Append the system messages that make up the system prompt
for m in yaml_data["input"]["system_messages"]:
    conversation.append_system_message(m)
# Query the model with the user prompt
response, _, _ = conversation.query(yaml_data["input"]["prompt"])

-# lower case, remove punctuation
+# Set response to lower case and remove punctuation
response = (
response.lower().replace(".", "").replace("?", "").replace("!", "")
).strip()
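A hedged sketch of the success criterion the docstring describes; answer_is_correct and its parameters are hypothetical names for illustration, not identifiers from this PR:

def answer_is_correct(case_name: str, expected_answer: str, response: str) -> bool:
    """Hypothetical helper mirroring the docstring's success criterion."""
    if "regex" in case_name:
        # 'regex' cases pass when the expected information occurs among
        # the words of the cleaned (lower-cased, punctuation-free) response.
        return expected_answer.lower() in response.split()
    # All other cases require an exact match.
    return response == expected_answer.lower()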
41 changes: 31 additions & 10 deletions docs/scripts/hooks.py
@@ -17,6 +17,7 @@ def on_pre_build(config, **kwargs) -> None:

result_files_path = "benchmark/results/"


result_file_names = [
f
for f in os.listdir(result_files_path)
@@ -52,23 +53,23 @@ def preprocess_results_for_frontend(
path (str): The path to the result files.
file_name (str): The file name of the result file.
"""
raw_results["score_possible"] = raw_results["score"].apply(
lambda x: float(x.split("/")[1])
raw_results["score_possible"] = raw_results.apply(
lambda x: float(x["score"].split("/")[1]) * x["iterations"], axis=1
)
raw_results["scores"] = raw_results["score"].apply(
lambda x: x.split("/")[0]
)
raw_results["score_achieved"] = raw_results["scores"].apply(
-    lambda x: np.mean(float(x.split(";")[0])) if ";" in x else float(x)
+    lambda x: np.sum([float(score) for score in x.split(";")]) if ";" in x else float(x)
)
raw_results["score_sd"] = raw_results["scores"].apply(
-    lambda x: np.std(float(x.split(";")[0])) if ";" in x else 0
+    lambda x: np.std([float(score) for score in x.split(";")], ddof=1) if ";" in x else 0
)
aggregated_scores = raw_results.groupby(["model_name"]).agg(
{
"score_possible": "sum",
"score_achieved": "sum",
"score_sd": "first",
"score_sd": "sum",
"iterations": "first",
}
)
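To make the corrected statistics concrete, a small self-contained example (synthetic values, not benchmark output) that applies the same transformations to a single score string of the form "achieved_1;...;achieved_n/possible":

import numpy as np
import pandas as pd

# Synthetic single-row frame in the score format used by the benchmark.
raw_results = pd.DataFrame(
    {"model_name": ["some-model"], "score": ["2;3;2/3"], "iterations": [3]}
)
# Possible points: 3 per iteration * 3 iterations = 9.0.
raw_results["score_possible"] = raw_results.apply(
    lambda x: float(x["score"].split("/")[1]) * x["iterations"], axis=1
)
scores = raw_results["score"].str.split("/").str[0]
# Achieved points: 2 + 3 + 2 = 7.0 across the three iterations.
raw_results["score_achieved"] = scores.apply(
    lambda x: np.sum([float(s) for s in x.split(";")])
)
# Sample standard deviation of [2, 3, 2] with ddof=1: about 0.577.
raw_results["score_sd"] = scores.apply(
    lambda x: np.std([float(s) for s in x.split(";")], ddof=1)
)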
@@ -116,23 +117,23 @@ def write_individual_extraction_task_results(raw_results: pd.DataFrame) -> None:
raw_results["subtask"] = raw_results["subtask"].apply(
lambda x: x.split(":")[1]
)
raw_results["score_possible"] = raw_results["score"].apply(
lambda x: float(x.split("/")[1])
raw_results["score_possible"] = raw_results.apply(
lambda x: float(x["score"].split("/")[1]) * x["iterations"], axis=1
)
raw_results["scores"] = raw_results["score"].apply(
lambda x: x.split("/")[0]
)
raw_results["score_achieved"] = raw_results["scores"].apply(
-    lambda x: np.mean(float(x.split(";")[0])) if ";" in x else float(x)
+    lambda x: np.sum([float(score) for score in x.split(";")]) if ";" in x else float(x)
)
raw_results["score_sd"] = raw_results["scores"].apply(
-    lambda x: np.std(float(x.split(";")[0])) if ";" in x else 0
+    lambda x: np.std([float(score) for score in x.split(";")], ddof=1) if ";" in x else 0
)
aggregated_scores = raw_results.groupby(["model_name", "subtask"]).agg(
{
"score_possible": "sum",
"score_achieved": "sum",
"score_sd": "first",
"score_sd": "mean",
"iterations": "first",
}
)
@@ -662,6 +663,26 @@ def plot_extraction_tasks():
sourcedata_info_extraction["score_sd"] = sourcedata_info_extraction[
"scores"
].apply(lambda x: np.std(float(x.split(";")[0])) if ";" in x else 0)
raw_results["score_possible"] = raw_results.apply(
lambda x: float(x["score"].split("/")[1]) * x["iterations"], axis=1
)
raw_results["scores"] = raw_results["score"].apply(
lambda x: x.split("/")[0]
)
raw_results["score_achieved"] = raw_results["scores"].apply(
lambda x: np.sum([float(score) for score in x.split(";")]) if ";" in x else float(x)
)
raw_results["score_sd"] = raw_results["scores"].apply(
lambda x: np.std([float(score) for score in x.split(";")], ddof=1) if ";" in x else 0
)
aggregated_scores = raw_results.groupby(["model_name"]).agg(
{
"score_possible": "sum",
"score_achieved": "sum",
"score_sd": "mean",
"iterations": "first",
}
)
aggregated_scores = sourcedata_info_extraction.groupby(
["model_name", "subtask"]
).agg(
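The diff is truncated at the per-subtask aggregation. As a rough illustration of what that groupby produces, a toy example with synthetic values (column names follow the diff; the numbers are made up):

import pandas as pd

# Toy per-subtask results; column names follow the diff, values are made up.
df = pd.DataFrame(
    {
        "model_name": ["m1", "m1", "m1", "m1"],
        "subtask": ["ner", "ner", "re", "re"],
        "score_possible": [9.0, 9.0, 9.0, 9.0],
        "score_achieved": [7.0, 8.0, 6.0, 9.0],
        "score_sd": [0.58, 0.50, 0.58, 0.00],
        "iterations": [3, 3, 3, 3],
    }
)
aggregated = df.groupby(["model_name", "subtask"]).agg(
    {
        "score_possible": "sum",   # total possible points per subtask
        "score_achieved": "sum",   # total achieved points per subtask
        "score_sd": "mean",        # mirrors the "mean" chosen in the diff
        "iterations": "first",
    }
)
print(aggregated)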
