Skip to content

Commit 1bf8594

Browse files
xmshi-trio施晓明Leymore
authoredDec 9, 2023
[Feature] Add medbench (#678)
* update medbench * medbench update * format medbench * format --------- Co-authored-by: 施晓明 <PJLAB\[email protected]> Co-authored-by: Leymore <[email protected]>
1 parent 7cb53a9 commit 1bf8594

13 files changed

+1705
-2
lines changed
 

‎.pre-commit-config-zh-cn.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ exclude: |
55
opencompass/utils/internal/|
66
opencompass/openicl/icl_evaluator/hf_metrics/|
77
opencompass/datasets/lawbench/utils|
8-
opencompass/datasets/lawbench/evaluation_functions/
8+
opencompass/datasets/lawbench/evaluation_functions/|
9+
opencompass/datasets/medbench/
910
)
1011
repos:
1112
- repo: https://gitee.com/openmmlab/mirrors-flake8

‎.pre-commit-config.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ exclude: |
55
opencompass/utils/internal/|
66
opencompass/openicl/icl_evaluator/hf_metrics/|
77
opencompass/datasets/lawbench/utils|
8-
opencompass/datasets/lawbench/evaluation_functions/
8+
opencompass/datasets/lawbench/evaluation_functions/|
9+
opencompass/datasets/medbench/
910
)
1011
repos:
1112
- repo: https://github.com/PyCQA/flake8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from mmengine.config import read_base

# Alias config: re-export the concrete MedBench dataset definitions so this
# file can be referenced by its stable name while the pinned variant
# (gen_d44f24) holds the actual settings.
with read_base():
    from .medbench_gen_d44f24 import medbench_datasets  # noqa: F401, F403
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import (
    MedBenchDataset,
    MedBenchEvaluator,
    MedBenchEvaluator_Cloze,
    MedBenchEvaluator_IE,
    MedBenchEvaluator_mcq,
    MedBenchEvaluator_CMeEE,
    MedBenchEvaluator_CMeIE,
    MedBenchEvaluator_CHIP_CDEE,
    MedBenchEvaluator_CHIP_CDN,
    MedBenchEvaluator_CHIP_CTC,
    MedBenchEvaluator_NLG,
    MedBenchEvaluator_TF,
    MedBenchEvaluator_EMR,
)
from opencompass.utils.text_postprocessors import first_capital_postprocess

medbench_reader_cfg = dict(
    input_columns=['problem_input'], output_column='label')

# Multiple-choice tasks, scored by accuracy.
medbench_multiple_choices_sets = ['Health_exam', 'DDx-basic', 'DDx-advanced_pre', 'DDx-advanced_final', 'SafetyBench']

# Open-ended QA tasks with reference answers, scored with NLG metrics.
medbench_qa_sets = ['Health_Counseling', 'Medicine_Counseling', 'MedDG', 'MedSpeQA', 'MedTreat', 'CMB-Clin']

# Closed-domain (cloze-style) QA with reference answers.
medbench_cloze_sets = ['Triage']

# True/false judgement tasks with reference answers.
medbench_single_choice_sets = ['Medicine_attack']

# Information-extraction tasks; predicted entities scored with F1.
medbench_ie_sets = ['EMR', 'CMeEE']

# Not yet enabled: 'CMeIE', 'CHIP_CDEE', 'CHIP_CDN', 'CHIP_CTC', 'Doc_parsing', 'MRG'


# Leading underscore keeps helper names out of the exported config
# namespace (mmengine ignores names starting with '_').
def _medbench_dataset(name, evaluator_type):
    """Return one MedBench dataset config for *name* scored by *evaluator_type*.

    The prompt/inference settings are identical for every MedBench subset,
    so they are built here once instead of being repeated per task group.
    """
    infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[dict(role="HUMAN", prompt='{problem_input}')])),
        # The retriever is inert here: zero-shot vs few-shot is controlled
        # by ``setting_name`` below, not by retrieval.
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer))
    eval_cfg = dict(
        evaluator=dict(type=evaluator_type), pred_role="BOT")
    return dict(
        type=MedBenchDataset,
        path='./data/MedBench/' + name,
        name=name,
        abbr='medbench-' + name,
        setting_name='zero-shot',
        reader_cfg=medbench_reader_cfg,
        infer_cfg=infer_cfg,
        eval_cfg=eval_cfg)


# Explicit per-task evaluator map for the IE subsets. Replaces the original
# ``eval('MedBenchEvaluator_' + name)``, which dispatched on a class name
# built with eval() — fragile and an eval-on-string antipattern.
_medbench_ie_evaluators = {
    'EMR': MedBenchEvaluator_EMR,
    'CMeEE': MedBenchEvaluator_CMeEE,
}

medbench_datasets = []
for _name in medbench_single_choice_sets:
    medbench_datasets.append(_medbench_dataset(_name, MedBenchEvaluator_TF))
for _name in medbench_multiple_choices_sets:
    medbench_datasets.append(_medbench_dataset(_name, MedBenchEvaluator))
for _name in medbench_qa_sets:
    medbench_datasets.append(_medbench_dataset(_name, MedBenchEvaluator_NLG))
for _name in medbench_cloze_sets:
    medbench_datasets.append(_medbench_dataset(_name, MedBenchEvaluator_Cloze))
for _name in medbench_ie_sets:
    medbench_datasets.append(
        _medbench_dataset(_name, _medbench_ie_evaluators[_name]))

del _name

‎opencompass/datasets/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
from .math import * # noqa: F401, F403
5757
from .mathbench import * # noqa: F401, F403
5858
from .mbpp import * # noqa: F401, F403
59+
from .medbench import * # noqa: F401, F403
5960
from .mmlu import * # noqa: F401, F403
6061
from .multirc import * # noqa: F401, F403
6162
from .narrativeqa import * # noqa: F401, F403
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# flake8: noqa
2+
3+
from .medbench import * # noqa: F401, F403
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# flake8: noqa
2+
import pandas as pd
3+
4+
5+
class TaskSchema(object):
    """Schema of one MedBench task: its passage/question/options plus the
    expected label/answer and any extra metadata. All fields are optional
    and default to ``None``."""

    def __init__(self,
                 passage=None,
                 question=None,
                 options=None,
                 label=None,
                 answer=None,
                 other=None):
        self.passage = passage
        self.question = question
        self.options = options
        self.label = label
        self.answer = answer
        self.other = other

    def to_dict(self):
        """Return the schema as a plain dict keyed by field name."""
        field_names = ('passage', 'question', 'options', 'label', 'answer',
                       'other')
        return {key: getattr(self, key) for key in field_names}
30+
31+
32+
# define README.json
33+
class MedBenchInstance(object):
34+
35+
def __init__(self, task_description, data_source, task_schema, output,
36+
evaluation_metric, task_example):
37+
self.task_description = task_description
38+
self.data_source = data_source
39+
self.task_schema = task_schema
40+
self.output = output
41+
self.evaluation_metric = evaluation_metric
42+
self.task_example = task_example
43+
44+
def to_dict(self):
45+
return {
46+
'task description': self.task_description,
47+
'data source': self.data_source,
48+
'task schema': self.task_schema.to_dict(),
49+
'output': self.output,
50+
'evaluation metric': self.evaluation_metric,
51+
'task example': self.task_example
52+
}
53+
54+
55+
class ChatGPTSchema(object):
    """Prompt payload for a chat model: the prompt context plus free-form
    bookkeeping metadata (defaults to an empty string)."""

    def __init__(self, context=None, metadata=''):
        self.context = context
        self.metadata = metadata

    def to_dict(self):
        """Return the payload as a plain dict."""
        return dict(context=self.context, metadata=self.metadata)
63+
64+
65+
class ResultsForHumanSchema(object):
    """One row of a human-readable evaluation dump.

    Collects the raw problem, the model's input/output, the parsed
    prediction and the gold label so a reviewer can audit each case.
    Only ``index``, ``problem_input`` and ``label`` are required; the
    remaining fields default to empty/False placeholders.
    """

    def __init__(self,
                 index,
                 problem_input,
                 label,
                 model_input='',
                 model_output='',
                 parse_result='',
                 first_stage_output='',
                 second_stage_input='',
                 is_correct=False):
        self.index = index
        self.problem_input = problem_input
        self.model_input = model_input
        self.model_output = model_output
        self.parse_result = parse_result
        self.label = label
        self.first_stage_output = first_stage_output
        self.second_stage_input = second_stage_input
        self.is_correct = is_correct

    def to_dict(self):
        """Serialize to a flat dict (this is the exported column order)."""
        return {
            'index': self.index,
            'problem_input': self.problem_input,
            'model_input': self.model_input,
            'model_output': self.model_output,
            'parse_result': self.parse_result,
            'label': self.label,
            'is_correct': self.is_correct,
            'first_stage_output': self.first_stage_output,
            'second_stage_input': self.second_stage_input,
        }

    @staticmethod
    def to_tsv(result_list, path):
        """Write *result_list* (ResultsForHumanSchema items) to *path* as TSV.

        Bug fix: the original body called ``table.to_excel(path)``, so a
        method named ``to_tsv`` actually produced an Excel workbook (and
        silently required openpyxl). Write real tab-separated values.
        """
        result_json = [item.to_dict() for item in result_list]
        table = pd.json_normalize(result_json)
        table.to_csv(path, sep='\t', index=False)

0 commit comments

Comments
 (0)
Please sign in to comment.