
Commit

Merge pull request #18 from mehizli/stats_graphs
Stats graphs
ytehran authored Jul 16, 2024
2 parents 40086de + e753aec commit ad2070d
Showing 5 changed files with 3,765 additions and 12 deletions.
3,718 changes: 3,718 additions & 0 deletions benchmark/Data_Analysis.ipynb

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions benchmark/benchmark_utils.py
@@ -306,6 +306,10 @@ def categorize_failure_modes(


def is_synonym(word1, word2):
"""
Tests, if the input arguments word1 and word2 are synonyms of each other.
If yes, the function returns True, False otherwise.
"""
if word2 in ("yes", "no", "ja", "nein"):
return False

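The hunk above shows only the guard clause of is_synonym. For orientation, a minimal sketch of how the check could be completed with NLTK's WordNet (the toolkit downloaded in test_user_interaction.py below); the function body here is an assumption for illustration, not the code from this PR:

from nltk.corpus import wordnet

def is_synonym(word1: str, word2: str) -> bool:
    """Return True if word2 appears among the WordNet lemmas of word1."""
    # Guard clause from the diff: yes/no style answers (including the
    # German "ja"/"nein") are never treated as synonyms.
    if word2 in ("yes", "no", "ja", "nein"):
        return False
    # Collect every lemma of every synset of word1, then test membership.
    synonyms = {
        lemma.lower()
        for synset in wordnet.synsets(word1)
        for lemma in synset.lemma_names()
    }
    return word2.lower() in synonyms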
1 change: 0 additions & 1 deletion benchmark/conftest.py
@@ -20,7 +20,6 @@

# which models should be benchmarked?
OPENAI_MODEL_NAMES = [
# "gpt-3.5-turbo-0613",
"gpt-3.5-turbo-0125",
# "gpt-4-0613",
# "gpt-4-0125-preview",
13 changes: 12 additions & 1 deletion benchmark/test_user_interaction.py
@@ -22,6 +22,15 @@ def test_medical_exam(
conversation,
multiple_testing,
):
"""Test medical exam data by the model.
The user input is a medical question with answer options. The system prompt
has the guidelines to answer the question, and the expected answer is the
information that the model should reply from the given question. If the case
contains the word 'regex', the test is successful if the extracted information
occures in the words in response. If it is a different question, the test is
successful if the extracted information matches the expected answer exactly.
For all false answers also calculate the failure mode of the answer.
"""
# Downloads the Natural Language Toolkit (NLTK) data; only needs to be done once per device
# nltk.download()

@@ -40,13 +49,15 @@ def run_test():
nonlocal expected_answer
nonlocal failure_mode
conversation.reset() # needs to be reset for each test
# Append the system messages that make up the system prompt
for m in yaml_data["input"]["system_messages"]:
    conversation.append_system_message(m)
# Query the model with the user prompt
response, _, _ = conversation.query(yaml_data["input"]["prompt"])

-# lower case, remove punctuation
+# Set response to lower case and remove punctuation
response = (
response.lower().replace(".", "").replace("?", "").replace("!", "")
).strip()
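A hedged sketch of the success criterion the docstring describes; answer_is_correct and its parameters are hypothetical names for illustration, not identifiers from this PR:

def answer_is_correct(case_name: str, expected_answer: str, response: str) -> bool:
    """Hypothetical helper mirroring the docstring's success criterion."""
    if "regex" in case_name:
        # 'regex' cases pass when the expected information occurs among
        # the words of the cleaned (lower-cased, punctuation-free) response.
        return expected_answer.lower() in response.split()
    # All other cases require an exact match.
    return response == expected_answer.lower()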
41 changes: 31 additions & 10 deletions docs/scripts/hooks.py
@@ -17,6 +17,7 @@ def on_pre_build(config, **kwargs) -> None:

result_files_path = "benchmark/results/"


result_file_names = [
f
for f in os.listdir(result_files_path)
@@ -52,23 +53,23 @@ def preprocess_results_for_frontend(
path (str): The path to the result files.
file_name (str): The file name of the result file.
"""
raw_results["score_possible"] = raw_results["score"].apply(
lambda x: float(x.split("/")[1])
raw_results["score_possible"] = raw_results.apply(
lambda x: float(x["score"].split("/")[1]) * x["iterations"], axis=1
)
raw_results["scores"] = raw_results["score"].apply(
lambda x: x.split("/")[0]
)
raw_results["score_achieved"] = raw_results["scores"].apply(
-    lambda x: np.mean(float(x.split(";")[0])) if ";" in x else float(x)
+    lambda x: np.sum([float(score) for score in x.split(";")]) if ";" in x else float(x)
)
raw_results["score_sd"] = raw_results["scores"].apply(
-    lambda x: np.std(float(x.split(";")[0])) if ";" in x else 0
+    lambda x: np.std([float(score) for score in x.split(";")], ddof=1) if ";" in x else 0
)
aggregated_scores = raw_results.groupby(["model_name"]).agg(
{
"score_possible": "sum",
"score_achieved": "sum",
"score_sd": "first",
"score_sd": "sum",
"iterations": "first",
}
)
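To make the corrected statistics concrete, a small self-contained example (synthetic values, not benchmark output) that applies the same transformations to a single score string of the form "achieved_1;...;achieved_n/possible":

import numpy as np
import pandas as pd

# Synthetic single-row frame in the score format used by the benchmark.
raw_results = pd.DataFrame(
    {"model_name": ["some-model"], "score": ["2;3;2/3"], "iterations": [3]}
)
# Possible points: 3 per iteration * 3 iterations = 9.0.
raw_results["score_possible"] = raw_results.apply(
    lambda x: float(x["score"].split("/")[1]) * x["iterations"], axis=1
)
scores = raw_results["score"].str.split("/").str[0]
# Achieved points: 2 + 3 + 2 = 7.0 across the three iterations.
raw_results["score_achieved"] = scores.apply(
    lambda x: np.sum([float(s) for s in x.split(";")])
)
# Sample standard deviation of [2, 3, 2] with ddof=1: about 0.577.
raw_results["score_sd"] = scores.apply(
    lambda x: np.std([float(s) for s in x.split(";")], ddof=1)
)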
@@ -116,23 +117,23 @@ def write_individual_extraction_task_results(raw_results: pd.DataFrame) -> None:
raw_results["subtask"] = raw_results["subtask"].apply(
lambda x: x.split(":")[1]
)
raw_results["score_possible"] = raw_results["score"].apply(
lambda x: float(x.split("/")[1])
raw_results["score_possible"] = raw_results.apply(
lambda x: float(x["score"].split("/")[1]) * x["iterations"], axis=1
)
raw_results["scores"] = raw_results["score"].apply(
lambda x: x.split("/")[0]
)
raw_results["score_achieved"] = raw_results["scores"].apply(
-    lambda x: np.mean(float(x.split(";")[0])) if ";" in x else float(x)
+    lambda x: np.sum([float(score) for score in x.split(";")]) if ";" in x else float(x)
)
raw_results["score_sd"] = raw_results["scores"].apply(
-    lambda x: np.std(float(x.split(";")[0])) if ";" in x else 0
+    lambda x: np.std([float(score) for score in x.split(";")], ddof=1) if ";" in x else 0
)
aggregated_scores = raw_results.groupby(["model_name", "subtask"]).agg(
{
"score_possible": "sum",
"score_achieved": "sum",
"score_sd": "first",
"score_sd": "mean",
"iterations": "first",
}
)
@@ -662,6 +663,26 @@ def plot_extraction_tasks():
sourcedata_info_extraction["score_sd"] = sourcedata_info_extraction[
"scores"
].apply(lambda x: np.std(float(x.split(";")[0])) if ";" in x else 0)
raw_results["score_possible"] = raw_results.apply(
lambda x: float(x["score"].split("/")[1]) * x["iterations"], axis=1
)
raw_results["scores"] = raw_results["score"].apply(
lambda x: x.split("/")[0]
)
raw_results["score_achieved"] = raw_results["scores"].apply(
lambda x: np.sum([float(score) for score in x.split(";")]) if ";" in x else float(x)
)
raw_results["score_sd"] = raw_results["scores"].apply(
lambda x: np.std([float(score) for score in x.split(";")], ddof=1) if ";" in x else 0
)
aggregated_scores = raw_results.groupby(["model_name"]).agg(
{
"score_possible": "sum",
"score_achieved": "sum",
"score_sd": "mean",
"iterations": "first",
}
)
aggregated_scores = sourcedata_info_extraction.groupby(
["model_name", "subtask"]
).agg(
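The diff is truncated at the per-subtask aggregation. As a rough illustration of what that groupby produces, a toy example with synthetic values (column names follow the diff; the numbers are made up):

import pandas as pd

# Toy per-subtask results; column names follow the diff, values are made up.
df = pd.DataFrame(
    {
        "model_name": ["m1", "m1", "m1", "m1"],
        "subtask": ["ner", "ner", "re", "re"],
        "score_possible": [9.0, 9.0, 9.0, 9.0],
        "score_achieved": [7.0, 8.0, 6.0, 9.0],
        "score_sd": [0.58, 0.50, 0.58, 0.00],
        "iterations": [3, 3, 3, 3],
    }
)
aggregated = df.groupby(["model_name", "subtask"]).agg(
    {
        "score_possible": "sum",   # total possible points per subtask
        "score_achieved": "sum",   # total achieved points per subtask
        "score_sd": "mean",        # mirrors the "mean" chosen in the diff
        "iterations": "first",
    }
)
print(aggregated)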
