Commit f97c4ea

authored Nov 26, 2024

[Update] Update Fullbench (#1712)

* Update JudgerBench
* Support O1-style Prompts
* Update Code

1 parent 300adc3 · commit f97c4ea

22 files changed: +1147 −14 lines
 

configs/datasets/subjective/judgerbench/judgerbench.py (−5 lines)

@@ -47,8 +47,3 @@
         infer_cfg=subjective_infer_cfg,
         eval_cfg=subjective_eval_cfg,
     ))
-# ds1000_eval_cfg = dict(
-#     evaluator=dict(type=DS1000Evaluator),
-#     pred_role='BOT',
-#     pred_postprocessor=dict(type=ds1000_postprocess),
-# )
[New file, +39 lines; path not shown in this view]

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2


aime2024_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer'
)


aime2024_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
            ],
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048)
)

aime2024_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)
)

aime2024_datasets = [
    dict(
        abbr='aime2024',
        type=Aime2024Dataset,
        path='opencompass/aime2024',
        reader_cfg=aime2024_reader_cfg,
        infer_cfg=aime2024_infer_cfg,
        eval_cfg=aime2024_eval_cfg
    )
]
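Worth noting: the prompt string mixes a '{question}' placeholder with the literal empty braces of \boxed{}. PromptTemplate appears to substitute only the named reader columns, which is why the empty braces survive; plain str.format would raise on them. A minimal sketch of that behavior (the column-wise replacement below is illustrative, not the library's code):

    # Sketch of placeholder filling: only the named reader column is
    # substituted, so the literal braces in \boxed{} survive intact.
    # (str.format would raise on the empty '{}' field instead.)
    tmpl = '{question}\nRemember to put your final answer within \\boxed{}.'
    filled = tmpl.replace('{question}', 'What is 2 + 2?')
    assert filled.endswith('\\boxed{}.')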
[New file, +96 lines; path not shown in this view]

import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n\nQuestion: {{input}}\n You must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:

    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n\nQuestion: {{input}}\n You must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
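A small detail in both loops: the prompt is an f-string with nothing to interpolate, so the doubled braces simply escape to a literal '{input}' placeholder that the prompt template fills later:

    # The doubled braces in the f-string escape to single braces, leaving
    # a literal '{input}' placeholder for the prompt template machinery.
    prompt = f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
    assert '{input}' in prompt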
[New file, +96 lines; path not shown in this view — differs from the previous BBH config only in the multiple-choice prompt, which drops the few-shot lead-in]

import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:

    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n\nQuestion: {{input}}\n You must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
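Both loops hand .copy() snapshots of the per-iteration configs to each dataset entry. dict.copy() is shallow, but because bbh_infer_cfg and bbh_eval_cfg are rebuilt from scratch on every iteration, no nested state leaks between datasets. A quick demonstration of the semantics:

    # dict.copy() is shallow: nested dicts are shared with the original.
    # The loops above avoid aliasing issues only because they rebuild the
    # config dicts from scratch on each iteration.
    a = dict(inner=dict(k=1))
    b = a.copy()
    assert b['inner'] is a['inner']   # nested dict is shared
    b['inner']['k'] = 2
    assert a['inner']['k'] == 2       # mutation visible through both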
[New file, +39 lines; path not shown in this view]

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CMOFibDataset, MATHEvaluator, math_postprocess_v2


cmo_fib_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer'
)


cmo_fib_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # i.e. 'You need to write your final answer within \boxed{}.'
                dict(role='HUMAN', prompt='{question}\n你需要将最终答案写入\\boxed{}.'),
            ],
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048)
)

cmo_fib_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)
)

cmo_fib_datasets = [
    dict(
        abbr='cmo_fib',
        type=CMOFibDataset,
        path='opencompass/cmo_fib',
        reader_cfg=cmo_fib_reader_cfg,
        infer_cfg=cmo_fib_infer_cfg,
        eval_cfg=cmo_fib_eval_cfg
    )
]
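Like the AIME and GSM8K configs, this one leans on math_postprocess_v2 to recover the answer from \boxed{...}. A rough sketch of what such a postprocessor has to do (illustrative only; the real math_postprocess_v2 also performs LaTeX normalization):

    def extract_boxed(text: str) -> str:
        # Illustrative only: return the contents of the last \boxed{...},
        # tracking brace depth so nested groups like \frac{1}{2} survive.
        start = text.rfind('\\boxed{')
        if start == -1:
            return ''
        i, depth, out = start + len('\\boxed{'), 1, []
        while i < len(text) and depth > 0:
            c = text[i]
            depth += (c == '{') - (c == '}')
            if depth > 0:
                out.append(c)
            i += 1
        return ''.join(out)

    assert extract_boxed('So \\boxed{\\frac{1}{2}} is it.') == '\\frac{1}{2}'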
[New file, +52 lines; path not shown in this view]

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GPQADataset, GPQA_Simple_Eval_postprocess, GPQAEvaluator

# openai_simple_eval prompt
align_prompt = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD.

{question}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()

gpqa_reader_cfg = dict(
    input_columns=['question', 'A', 'B', 'C', 'D'],
    output_column='answer')

gpqa_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=align_prompt),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
                     pred_postprocessor=dict(type=GPQA_Simple_Eval_postprocess))

gpqa_datasets = []
gpqa_subsets = {
    # 'extended': 'gpqa_extended.csv',
    # 'main': 'gpqa_main.csv',
    'diamond': 'gpqa_diamond.csv'
}

for split in list(gpqa_subsets.keys()):
    gpqa_datasets.append(
        dict(
            abbr='GPQA_' + split,
            type=GPQADataset,
            path='./data/gpqa/',
            name=gpqa_subsets[split],
            reader_cfg=gpqa_reader_cfg,
            infer_cfg=gpqa_infer_cfg,
            eval_cfg=gpqa_eval_cfg)
    )
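The 'ANSWER: $LETTER' instruction in align_prompt mirrors OpenAI's simple-evals format, which GPQA_Simple_Eval_postprocess presumably matches against. An illustrative parser for that format (not the library's actual implementation):

    import re

    def parse_answer_letter(response: str) -> str:
        # Illustrative only: pull the choice letter out of the
        # "ANSWER: $LETTER" line that align_prompt asks the model for.
        m = re.search(r'ANSWER\s*:\s*\$?([ABCD])', response)
        return m.group(1) if m else ''

    assert parse_answer_letter('Reasoning...\nANSWER: C') == 'C'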
[New file, +37 lines; path not shown in this view]

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
from opencompass.datasets import MATHEvaluator, math_postprocess_v2

gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')

gsm8k_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{question}\nPlease put your final answer within \\boxed{}.'),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

gsm8k_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2),
    dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
)

gsm8k_datasets = [
    dict(
        abbr='gsm8k',
        type=GSM8KDataset,
        path='opencompass/gsm8k',
        reader_cfg=gsm8k_reader_cfg,
        infer_cfg=gsm8k_infer_cfg,
        eval_cfg=gsm8k_eval_cfg,
    )
]
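Taken together, these per-dataset configs are the building blocks of a full-bench run: OpenCompass top-level configs typically pull them in via mmengine's read_base() and concatenate the dataset lists. A hypothetical aggregate config in the spirit of this commit (every import path below is a guess, since this view hides the new files' names):

    # eval_fullbench.py -- hypothetical aggregate config; all module paths
    # below are guesses, as this view does not show the new files' paths.
    from mmengine.config import read_base

    with read_base():
        from .aime2024_gen import aime2024_datasets    # hypothetical path
        from .bbh_gen import bbh_datasets              # hypothetical path
        from .cmo_fib_gen import cmo_fib_datasets      # hypothetical path
        from .gpqa_gen import gpqa_datasets            # hypothetical path
        from .gsm8k_gen import gsm8k_datasets          # hypothetical path

    datasets = [
        *aime2024_datasets, *bbh_datasets, *cmo_fib_datasets,
        *gpqa_datasets, *gsm8k_datasets,
    ]
    # Typical invocation: python run.py <path to this config>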
