Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Update MathBench & WikiBench for FullBench #1521

Merged
merged 7 commits into from
Sep 18, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Update GPQA & MMLU_Pro
liushz committed Sep 14, 2024
commit b8640c102a2b90cb570abb27727dde24fa2afb4a
49 changes: 49 additions & 0 deletions opencompass/configs/datasets/gpqa/gpqa_ppl_2c9cd6.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import GPQADataset, GPQAEvaluator
from opencompass.utils import first_option_postprocess

# Column layout for GPQA samples: the question text plus the four answer
# options as inputs, with the gold option letter in the ``answer`` column.
gpqa_reader_cfg = {
    'input_columns': ['question', 'A', 'B', 'C', 'D'],
    'output_column': 'answer',
}

# Instruction prepended to every prompt (Chinese: "for the following
# single-choice question, give the correct option directly").
# NOTE: plain string literal — the original used an f-string with no
# placeholders.
hint = '对下面的单项选择题,请直接给出正确答案的选项。'
question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n'

# PPL-style inference: one prompt variant per candidate option; the
# PPLInferencer selects the option whose completion has the lowest
# perplexity.
gpqa_infer_cfg = dict(
    # In-context example template: a completed question/answer pair.
    ice_template=dict(
        type=PromptTemplate,
        template={
            opt: f'{question_and_options}\nAnswer: {opt}'
            for opt in ['A', 'B', 'C', 'D']
        },
    ),
    # Final prompt: hint, then the retrieved examples (spliced at </E>),
    # then the test question ending in each candidate answer letter.
    prompt_template=dict(
        type=PromptTemplate,
        template={
            opt: f'{hint}\n</E>{question_and_options}\nAnswer: {opt}'
            for opt in ['A', 'B', 'C', 'D']
        },
        ice_token='</E>'
    ),
    # Always use samples 0-4 as the 5-shot in-context examples.
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
    inferencer=dict(type=PPLInferencer))

# Accuracy evaluator that also records per-sample details.
gpqa_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))

gpqa_datasets = []
gpqa_subsets = {
    # 'extended': 'gpqa_extended.csv',
    # 'main': 'gpqa_main.csv',
    'diamond': 'gpqa_diamond.csv'
}

# One dataset config per enabled GPQA subset (iterate items() directly
# instead of the old ``list(gpqa_subsets.keys())`` + lookup).
for split, csv_name in gpqa_subsets.items():
    gpqa_datasets.append(
        dict(
            abbr='GPQA_' + split,
            type=GPQADataset,
            path='./data/gpqa/',
            name=csv_name,
            reader_cfg=gpqa_reader_cfg,
            infer_cfg=gpqa_infer_cfg,
            eval_cfg=gpqa_eval_cfg)
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MMLUProDataset, MMLUProBaseEvaluator

with read_base():
from .mmlu_pro_categories import categories

mmlu_pro_datasets = []

# The question/options skeleton and the reader/eval configs are identical
# for every category, so build them once outside the loop (the GPQA config
# in this PR shares its cfg dicts across dataset entries the same way).
question_and_options = 'Question:\n{question}\nOptions:\n{options_str}'

mmlu_pro_reader_cfg = dict(
    input_columns=['question', 'cot_content', 'options_str'],
    output_column='answer_string',
    train_split='validation',
    test_split='test',
)

mmlu_pro_eval_cfg = dict(
    evaluator=dict(type=MMLUProBaseEvaluator)
)

for category in categories:
    # Per-category instruction line.
    hint = f'Answer the following multiple choice question about {category}, and give your answer option directly.'
    mmlu_pro_infer_cfg = dict(
        # In-context example: question + options followed by the gold
        # answer string.
        ice_template=dict(
            type=PromptTemplate,
            template=f'{question_and_options}\nAnswer: {{answer}}'),
        # Final prompt: hint, five fixed in-context examples (spliced at
        # </E>), then the test question; the model generates the answer.
        prompt_template=dict(
            type=PromptTemplate,
            template=f'{hint}\n</E>{question_and_options}\nAnswer: ',
            ice_token='</E>'
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer, max_out_len=100)
    )

    mmlu_pro_datasets.append(
        dict(
            abbr=f'mmlu_pro_{category.replace(" ", "_")}',
            type=MMLUProDataset,
            path='opencompass/mmlu_pro',
            category=category,
            reader_cfg=mmlu_pro_reader_cfg,
            infer_cfg=mmlu_pro_infer_cfg,
            eval_cfg=mmlu_pro_eval_cfg,
        ))
46 changes: 44 additions & 2 deletions opencompass/datasets/mmlu_pro.py
Original file line number Diff line number Diff line change
@@ -3,19 +3,26 @@

from datasets import load_dataset

from opencompass.openicl import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from .base import BaseDataset

CHOICES=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']

def _parse(item):
choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']

s = ''
item['answer_string'] = ''
for i, opt in enumerate(item['options']):
if opt == 'N/A':
continue
s += '{}. {}\n'.format(choices[i], opt)
option = '{}. {}\n'.format(CHOICES[i], opt)
s += option
if item['answer'] == CHOICES[i]:
item['answer_string'] = option

item['options_str'] = s.strip()
item['cot_content'] = item['cot_content'].removeprefix("A: Let's think step by step.").strip()
return item
@@ -31,3 +38,38 @@ def load(path: str, category: str):
mmlu_pro = mmlu_pro.filter(lambda x: x['category'] == category)
mmlu_pro = mmlu_pro.map(_parse)
return mmlu_pro

class MMLUProBaseEvaluator(BaseEvaluator):
    """Accuracy evaluator for MMLU-Pro base-model generations.

    A prediction counts as correct when its first line equals either the
    gold option letter (e.g. ``'C'``) or the gold option text. References
    are the rendered ``answer_string`` lines produced by ``_parse``.
    """

    def is_equal(self, pred, refer):
        """Return True when ``pred`` matches the reference option.

        ``refer`` looks like ``'C. some text'``; ``pred`` may be the bare
        option letter or the option text itself.
        """
        try:
            if not refer:
                return False
            # Split only on the FIRST '. ': the old two-way unpack of
            # refer.split('. ') raised ValueError for option texts that
            # themselves contain '. ', silently scoring them as wrong.
            refer_option, _, refer_string = refer.partition('. ')
            if pred in CHOICES and refer_option == pred:
                return True
            return refer_string.strip() == pred
        except AttributeError:
            # Non-string reference cannot match anything.
            return False

    def score(self, predictions, references):
        """Return ``{'accuracy': <0-100 float>, 'details': [...]}``.

        Falls back to an ``{'error': ...}`` dict on length mismatch.
        """
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different '
                'length'
            }
        if not predictions:
            # Guard the division below against empty (equal-length) input.
            return {'accuracy': 0.0, 'details': []}
        correct = 0
        details = []
        for pred, refer in zip(predictions, references):
            # Base models often keep generating; only the first line is
            # treated as the answer.
            pred = pred.split('\n')[0].strip()
            ok = self.is_equal(pred, refer)
            details.append({'pred': pred, 'answer': refer, 'correct': ok})
            correct += ok
        return {
            'accuracy': 100 * correct / len(predictions),
            'details': details,
        }
Loading