#!/bin/bash
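#
# Evaluates the NousResearch Yarn-Llama-2 and Yarn-Mistral checkpoints on
# ARC-Challenge (25-shot), HellaSwag (10-shot), MMLU (5-shot), and
# TruthfulQA (0-shot) using EleutherAI's lm-evaluation-harness.
# LM_EVALUATION_HARNESS_PATH below is expected to point at a local checkout
# of that repository; per-model, per-benchmark results are written as JSON
# files under data/.
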
LM_EVALUATION_HARNESS_PATH="../lm-evaluation-harness"
ARGS="--model=hf-causal-experimental --batch_size 2"
MODEL_ARGS="use_accelerate=True,dtype=bfloat16,trust_remote_code=True"
ARC="--tasks=arc_challenge --num_fewshot=25"
HELLASWAG="--tasks=hellaswag --num_fewshot=10"
TRUTHFULQA="--tasks=truthfulqa_mc --num_fewshot=0"
MMLU="--tasks=hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions --num_fewshot=5"
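
# Output files are written under data/; create the directory up front in case
# the evaluation harness does not create it itself.
mkdir -p data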

### ARC-Challenge
python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${ARC} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-7b-64k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-7b-64k-arc.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${ARC} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-7b-128k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-7b-128k-arc.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${ARC} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-13b-64k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-13b-64k-arc.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${ARC} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-13b-128k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-13b-128k-arc.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${ARC} \
    --model_args="pretrained=NousResearch/Yarn-Mistral-7b-64k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Mistral-7b-64k-arc.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${ARC} \
    --model_args="pretrained=NousResearch/Yarn-Mistral-7b-128k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Mistral-7b-128k-arc.json"

### HellaSwag
python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${HELLASWAG} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-7b-64k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-7b-64k-hellaswag.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${HELLASWAG} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-7b-128k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-7b-128k-hellaswag.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${HELLASWAG} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-13b-64k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-13b-64k-hellaswag.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${HELLASWAG} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-13b-128k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-13b-128k-hellaswag.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${HELLASWAG} \
    --model_args="pretrained=NousResearch/Yarn-Mistral-7b-64k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Mistral-7b-64k-hellaswag.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${HELLASWAG} \
    --model_args="pretrained=NousResearch/Yarn-Mistral-7b-128k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Mistral-7b-128k-hellaswag.json"

### MMLU
python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${MMLU} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-7b-64k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-7b-64k-mmlu.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${MMLU} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-7b-128k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-7b-128k-mmlu.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${MMLU} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-13b-64k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-13b-64k-mmlu.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${MMLU} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-13b-128k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-13b-128k-mmlu.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${MMLU} \
    --model_args="pretrained=NousResearch/Yarn-Mistral-7b-64k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Mistral-7b-64k-mmlu.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${MMLU} \
    --model_args="pretrained=NousResearch/Yarn-Mistral-7b-128k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Mistral-7b-128k-mmlu.json"

### TruthfulQA
python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${TRUTHFULQA} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-7b-64k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-7b-64k-truthfulqa.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${TRUTHFULQA} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-7b-128k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-7b-128k-truthfulqa.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${TRUTHFULQA} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-13b-64k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-13b-64k-truthfulqa.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${TRUTHFULQA} \
    --model_args="pretrained=NousResearch/Yarn-Llama-2-13b-128k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Llama-2-13b-128k-truthfulqa.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${TRUTHFULQA} \
    --model_args="pretrained=NousResearch/Yarn-Mistral-7b-64k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Mistral-7b-64k-truthfulqa.json"

python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} \
    ${TRUTHFULQA} \
    --model_args="pretrained=NousResearch/Yarn-Mistral-7b-128k,${MODEL_ARGS}" \
    --output_path="data/Yarn-Mistral-7b-128k-truthfulqa.json"
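
### Note: the explicit invocations above are equivalent to a nested loop over
### models and benchmarks. A minimal sketch of that form is kept here commented
### out (so it does not run in addition to the commands above); it assumes
### bash 4+ for associative arrays and reuses the variables defined at the top.
#
# MODELS=(
#     NousResearch/Yarn-Llama-2-7b-64k
#     NousResearch/Yarn-Llama-2-7b-128k
#     NousResearch/Yarn-Llama-2-13b-64k
#     NousResearch/Yarn-Llama-2-13b-128k
#     NousResearch/Yarn-Mistral-7b-64k
#     NousResearch/Yarn-Mistral-7b-128k
# )
# declare -A TASKS=(
#     [arc]="${ARC}"
#     [hellaswag]="${HELLASWAG}"
#     [mmlu]="${MMLU}"
#     [truthfulqa]="${TRUTHFULQA}"
# )
# for MODEL in "${MODELS[@]}"; do
#     for TASK in "${!TASKS[@]}"; do
#         python ${LM_EVALUATION_HARNESS_PATH}/main.py ${ARGS} ${TASKS[$TASK]} \
#             --model_args="pretrained=${MODEL},${MODEL_ARGS}" \
#             --output_path="data/${MODEL#NousResearch/}-${TASK}.json"
#     done
# done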