Commit c9a7026

Authored by liushz on Sep 18, 2024
[Feature] Update MathBench & WikiBench for FullBench (#1521)
* Update MathBench & WikiBench for FullBench
* Update GPQA & MMLU_Pro

Co-authored-by: liushz <[email protected]>
1 parent cfbd308 commit c9a7026

File tree

9 files changed: +426 −2 lines changed

New file (81 added lines):

@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, math_postprocess_v2
from opencompass.utils.text_postprocessors import first_option_postprocess

with read_base():
    from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets

# Max for this dataset is 4
num_shot = 0
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
# Use PPL mode in single choice test or not
use_ppl_single_choice = False

assert 0 <= num_shot <= 4
if num_shot == 0:
    prompts = zero_shot_prompts
else:
    prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}

mathbench_datasets = []
for _split in mathbench_sets:
    for _name in mathbench_sets[_split]:
        if 'single_choice' in _name:
            if with_reasoning:
                template_round = prompts[_name + '_with_reasoning']
            else:
                template_round = prompts[_name]
        else:
            template_round = prompts[_name]

        if 'single_choice' in _name:
            pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
        else:
            pred_postprocessor = dict(type=math_postprocess_v2)

        if 'single_choice' in _name and with_circular_eval:
            evaluator = dict(type=CircularEvaluator)
        else:
            evaluator = dict(type=AccEvaluator)

        # assemble the final config
        mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
        if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
            template = {}
            for answer in ['A', 'B', 'C', 'D']:
                one_template_round = deepcopy(template_round)
                one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer)
                template[answer] = dict(round=one_template_round)
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=template),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=PPLInferencer),
            )
        else:
            mathbench_infer_cfg = dict(
                prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=GenInferencer, max_out_len=2048),
            )
        mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)

        mathbench_datasets.append(
            dict(
                abbr='mathbench-' + _split + '-' + _name,
                type=MathBenchDataset,
                path=f'data/mathbench_v1/{_split}',
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=mathbench_reader_cfg,
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            )
        )
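Reviewer note: the slice p[- 2 * num_shot - 2:] in the few-shot branch is easy to misread. The sketch below is my own illustration, not part of the commit; it assumes a few_shot_prompts entry is a flat list of alternating HUMAN/BOT turns ending with the templated question/answer pair, which is consistent with how the PPL branch formats the final turn's '{answer}' placeholder.

# Illustration only (assumed prompt layout, not from the commit).
few_shot_example = [
    dict(role='HUMAN', prompt='Q1'), dict(role='BOT', prompt='A1'),
    dict(role='HUMAN', prompt='Q2'), dict(role='BOT', prompt='A2'),
    dict(role='HUMAN', prompt='Q3'), dict(role='BOT', prompt='A3'),
    dict(role='HUMAN', prompt='Q4'), dict(role='BOT', prompt='A4'),
    dict(role='HUMAN', prompt='{question}'), dict(role='BOT', prompt='{answer}'),
]
num_shot = 2
kept = few_shot_example[- 2 * num_shot - 2:]
# Keeps the last 2 worked examples plus the final {question}/{answer} pair.
print(len(kept))  # 6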

‎configs/datasets/MathBench/mathbench_prompt.py

+6
@@ -11,6 +11,12 @@
     'single_choice_en': [
         dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'),
     ],
+    'cloze_en': [
+        dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
+    ],
+    'cloze_cn': [
+        dict(role='HUMAN', prompt='{question}\n请一步一步推理,并在最后用\\boxed{}给出你的答案。'),
+    ]
 }
 
 few_shot_prompts = {
New file (56 added lines):

@@ -0,0 +1,56 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import WikiBenchDataset
from opencompass.utils.text_postprocessors import first_option_postprocess


single_choice_prompts = {
    'single_choice_cn': '以下是一道单项选择题,请你根据你了解的知识一步步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。。\n下面是你要回答的题目:\n{question}\n让我们一步步推理:',
}

wikibench_sets = {
    'wiki': ['single_choice_cn'],
}

do_circular = True

wikibench_datasets = []

for _split in list(wikibench_sets.keys()):
    for _name in wikibench_sets[_split]:
        wikibench_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin='</E>',
                    round=[
                        dict(role='HUMAN', prompt=single_choice_prompts[_name]),
                        dict(role='BOT', prompt='{answer}'),
                    ],
                ),
                ice_token='</E>',
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer),
        )
        wikibench_eval_cfg = dict(
            evaluator=dict(type=CircularEvaluator if do_circular else AccEvaluator),
            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
        )

        wikibench_datasets.append(
            dict(
                type=WikiBenchDataset,
                path=f'./data/WikiBench/{_name}.jsonl',
                name='circular_' + _name if do_circular else _name,
                abbr='wikibench-' + _split + '-' + _name + 'circular' if do_circular else '',
                reader_cfg=dict(
                    input_columns=['question'],
                    output_column='answer',
                ),
                infer_cfg=wikibench_infer_cfg,
                eval_cfg=wikibench_eval_cfg,
            )
        )
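Reviewer note: first_option_postprocess comes from opencompass.utils.text_postprocessors. The snippet below is my own simplified stand-in for the idea (pull the first option letter out of a free-form reasoning answer), not the library's actual implementation.

import re

# Simplified stand-in, NOT the real first_option_postprocess.
def extract_first_option(text: str, options: str = 'ABCD') -> str:
    match = re.search(rf'[{options}]', text)
    return match.group(0) if match else ''

print(extract_first_option('让我们一步步推理:……所以答案为选项B。'))  # B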
New file (81 added lines): @@ -0,0 +1,81 @@ — content identical, line for line, to the MathBench configuration reproduced above; the commit adds the same config a second time.

‎opencompass/configs/datasets/MathBench/mathbench_prompt.py

+6
@@ -11,6 +11,12 @@ — same change as the configs/datasets/MathBench/mathbench_prompt.py diff above: the identical cloze_en and cloze_cn zero-shot prompts are added to this copy of the file.
New file (49 added lines):

@@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import GPQADataset, GPQAEvaluator
from opencompass.utils import first_option_postprocess

gpqa_reader_cfg = dict(
    input_columns=['question', 'A', 'B', 'C', 'D'],
    output_column='answer')

hint = f'对下面的单项选择题,请直接给出正确答案的选项。'
question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n'
gpqa_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template={
            opt: f'{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']},
    ),
    prompt_template=dict(
        type=PromptTemplate,
        template={
            opt: f'{hint}\n</E>{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']
        },
        ice_token='</E>'
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
    inferencer=dict(type=PPLInferencer))

gpqa_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))

gpqa_datasets = []
gpqa_subsets = {
    # 'extended': 'gpqa_extended.csv',
    # 'main': 'gpqa_main.csv',
    'diamond': 'gpqa_diamond.csv'
}

for split in list(gpqa_subsets.keys()):
    gpqa_datasets.append(
        dict(
            abbr='GPQA_' + split,
            type=GPQADataset,
            path='./data/gpqa/',
            name=gpqa_subsets[split],
            reader_cfg=gpqa_reader_cfg,
            infer_cfg=gpqa_infer_cfg,
            eval_cfg=gpqa_eval_cfg)
    )
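Reviewer note: with the PPL inferencer, the prompt_template dict above yields one candidate prompt per option. The snippet below is my own rough, self-contained illustration of what the 'A' candidate looks like once the reader columns are filled in (dummy question, in-context examples omitted); it is an assumed rendering, not output captured from OpenCompass.

# Illustration only: render the 'A' candidate with dummy values.
hint = '对下面的单项选择题,请直接给出正确答案的选项。'
question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n'

candidate = f'{hint}\n{question_and_options}\nAnswer: A'.format(
    question='Which particle mediates the electromagnetic force?',
    A='Photon', B='Gluon', C='W boson', D='Graviton')
print(candidate)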
New file (47 added lines):

@@ -0,0 +1,47 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MMLUProDataset, MMLUProBaseEvaluator

with read_base():
    from .mmlu_pro_categories import categories

mmlu_pro_datasets = []

for category in categories:
    hint = f'Answer the following multiple choice question about {category}, and give your answer option directly.'
    question_and_options = 'Question:\n{question}\nOptions:\n{options_str}'
    mmlu_pro_reader_cfg = dict(
        input_columns=['question', 'cot_content', 'options_str'],
        output_column='answer_string',
        train_split='validation',
        test_split='test',
    )
    mmlu_pro_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=f'{question_and_options}\nAnswer: {{answer}}'),
        prompt_template=dict(
            type=PromptTemplate,
            template=f'{hint}\n</E>{question_and_options}\nAnswer: ',
            ice_token='</E>'
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer, max_out_len=100)
    )

    mmlu_pro_eval_cfg = dict(
        evaluator=dict(type=MMLUProBaseEvaluator)
    )

    mmlu_pro_datasets.append(
        dict(
            abbr=f'mmlu_pro_{category.replace(" ", "_")}',
            type=MMLUProDataset,
            path='opencompass/mmlu_pro',
            category=category,
            reader_cfg=mmlu_pro_reader_cfg,
            infer_cfg=mmlu_pro_infer_cfg,
            eval_cfg=mmlu_pro_eval_cfg,
        ))
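Reviewer note: the ice_template above is built with an f-string, so {{answer}} is needed to leave a literal {answer} placeholder for the retriever to fill in later, while {question_and_options} is expanded immediately. A quick self-contained check (my own illustration, not part of the commit):

question_and_options = 'Question:\n{question}\nOptions:\n{options_str}'
ice_template_str = f'{question_and_options}\nAnswer: {{answer}}'
print(ice_template_str)
# Question:
# {question}
# Options:
# {options_str}
# Answer: {answer}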
New file (56 added lines): @@ -0,0 +1,56 @@ — content identical, line for line, to the WikiBench configuration reproduced above; the commit adds the same config a second time.

‎opencompass/datasets/mmlu_pro.py

+44-2
@@ -3,19 +3,26 @@
 
 from datasets import load_dataset
 
+from opencompass.openicl import BaseEvaluator
 from opencompass.registry import LOAD_DATASET
 from opencompass.utils import get_data_path
 
 from .base import BaseDataset
 
+CHOICES=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']
 
 def _parse(item):
-    choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']
+
     s = ''
+    item['answer_string'] = ''
     for i, opt in enumerate(item['options']):
         if opt == 'N/A':
             continue
-        s += '{}. {}\n'.format(choices[i], opt)
+        option = '{}. {}\n'.format(CHOICES[i], opt)
+        s += option
+        if item['answer'] == CHOICES[i]:
+            item['answer_string'] = option
+
     item['options_str'] = s.strip()
     item['cot_content'] = item['cot_content'].removeprefix("A: Let's think step by step.").strip()
     return item

@@ -31,3 +38,38 @@ def load(path: str, category: str):
         mmlu_pro = mmlu_pro.filter(lambda x: x['category'] == category)
         mmlu_pro = mmlu_pro.map(_parse)
         return mmlu_pro
+
+class MMLUProBaseEvaluator(BaseEvaluator):
+
+    def is_equal(self, pred, refer):
+        try:
+            refer_option, refer_string = refer.split('. ')
+            if pred in CHOICES and refer_option == pred:
+                return True
+            elif refer_string.strip() == pred:
+                return True
+            else :
+                return False
+        except Exception:
+            pass
+        return False
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        correct = 0
+        count = 0
+        details = []
+        for i, j in zip(predictions, references):
+            i = i.split('\n')[0].strip()
+            detail = {'pred': i, 'answer': j, 'correct': False}
+            count += 1
+            if self.is_equal(i, j):
+                correct += 1
+                detail['correct'] = True
+            details.append(detail)
+        result = {'accuracy': 100 * correct / count, 'details': details}
+        return result
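Reviewer note: a hypothetical usage sketch of the new evaluator (my own example, assuming the commit is installed, that the class is importable from opencompass.datasets.mmlu_pro, and that BaseEvaluator needs no constructor arguments). is_equal accepts either the bare option letter or the option text, and score compares only the first line of each prediction.

from opencompass.datasets.mmlu_pro import MMLUProBaseEvaluator  # import path assumed from this diff

evaluator = MMLUProBaseEvaluator()
result = evaluator.score(
    predictions=['B', 'Paris\nsome extra reasoning', 'C'],
    references=['B. Paris\n', 'B. Paris\n', 'B. Paris\n'],
)
# 'B' matches the reference option letter, 'Paris' matches the option text,
# and 'C' matches neither, so 2 of 3 predictions count as correct.
print(result['accuracy'])  # 66.66...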
