import os.path as osp

from mmengine.config import read_base

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

#######################################################################
#                      PART 0  Essential Configs                      #
#######################################################################
with read_base():
    # Datasets Part
    ## Core Set
    # ## Examination
    from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets
    from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import cmmlu_datasets
    # ## Reasoning
    from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets
    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
    # ## Math
    from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets
    # ## Coding
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    # ## Instruction Following
    from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets

    # Summarizer
    from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
    from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups
    from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups
    from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups

    # Model List
    # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
    # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
    # from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
    # from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
    # from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
    # from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model

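# How the aggregation below works: `read_base()` treats the imports above as
# base configs and merges the imported variables (e.g. `mmlu_datasets`,
# `bbh_summary_groups`) into this file's namespace, which is why the
# `sum(...)` calls in PART 1 / PART 2 / PART 3 can collect them from
# `locals()` by name suffix.
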
#######################################################################
#                        PART 1  Datasets List                        #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
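
# Optional, illustrative filter (not part of the original config): to run only
# a subset of the collected benchmarks, the list could be narrowed by the
# `abbr` field of each dataset config, e.g.:
# datasets = [d for d in datasets if 'mmlu' in d.get('abbr', '')]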


#######################################################################
#                      PART 2  Dataset Summarizer                     #
#######################################################################

# `core_average` is an unweighted (naive) average over the headline metric of
# each core benchmark listed below.
core_summary_groups = [
    {
        'name': 'core_average',
        'subsets': [
            ['mmlu', 'accuracy'],
            ['mmlu_pro', 'accuracy'],
            # ['cmmlu', 'naive_average'],
            ['cmmlu', 'accuracy'],
            ['bbh', 'score'],
            ['math', 'accuracy'],
            ['openai_humaneval', 'humaneval_pass@1'],
            ['GPQA_diamond', 'accuracy'],
            ['IFEval', 'Prompt-level-strict-accuracy'],
        ],
    },
]

summarizer = dict(
    dataset_abbrs=[
        ['core_average', 'naive_average'],
        ['mmlu', 'accuracy'],
        ['mmlu_pro', 'accuracy'],
        ['cmmlu', 'accuracy'],
        ['bbh', 'score'],
        ['math', 'accuracy'],
        ['openai_humaneval', 'humaneval_pass@1'],
        ['GPQA_diamond', 'accuracy'],
        ['IFEval', 'Prompt-level-strict-accuracy'],
        '',

        ['mmlu', 'accuracy'],
        ['mmlu-stem', 'accuracy'],
        ['mmlu-social-science', 'accuracy'],
        ['mmlu-humanities', 'accuracy'],
        ['mmlu-other', 'accuracy'],

        '',
        ['mmlu_pro', 'accuracy'],
        ['mmlu_pro_math', 'accuracy'],
        ['mmlu_pro_physics', 'accuracy'],
        ['mmlu_pro_chemistry', 'accuracy'],
        ['mmlu_pro_law', 'accuracy'],
        ['mmlu_pro_engineering', 'accuracy'],
        ['mmlu_pro_other', 'accuracy'],
        ['mmlu_pro_economics', 'accuracy'],
        ['mmlu_pro_health', 'accuracy'],
        ['mmlu_pro_psychology', 'accuracy'],
        ['mmlu_pro_business', 'accuracy'],
        ['mmlu_pro_biology', 'accuracy'],
        ['mmlu_pro_philosophy', 'accuracy'],
        ['mmlu_pro_computer_science', 'accuracy'],
        ['mmlu_pro_history', 'accuracy'],
        '',
        ['cmmlu', 'accuracy'],
        ['cmmlu-stem', 'accuracy'],
        ['cmmlu-social-science', 'accuracy'],
        ['cmmlu-humanities', 'accuracy'],
        ['cmmlu-other', 'accuracy'],
        ['cmmlu-china-specific', 'accuracy'],
        '',
        ['bbh', 'extract_rate'],
        ['math', 'extract_rate'],
        # ['openai_humaneval', 'extract_rate'],
        ['GPQA_diamond', 'extract_rate'],
        # ['IFEval', 'extract_rate'],
        '',
        ['mmlu', 'extract_rate'],
        ['mmlu-stem', 'extract_rate'],
        ['mmlu-social-science', 'extract_rate'],
        ['mmlu-humanities', 'extract_rate'],
        ['mmlu-other', 'extract_rate'],
        '',
        ['mmlu_pro', 'extract_rate'],
        ['mmlu_pro_math', 'extract_rate'],
        ['mmlu_pro_physics', 'extract_rate'],
        ['mmlu_pro_chemistry', 'extract_rate'],
        ['mmlu_pro_law', 'extract_rate'],
        ['mmlu_pro_engineering', 'extract_rate'],
        ['mmlu_pro_other', 'extract_rate'],
        ['mmlu_pro_economics', 'extract_rate'],
        ['mmlu_pro_health', 'extract_rate'],
        ['mmlu_pro_psychology', 'extract_rate'],
        ['mmlu_pro_business', 'extract_rate'],
        ['mmlu_pro_biology', 'extract_rate'],
        ['mmlu_pro_philosophy', 'extract_rate'],
        ['mmlu_pro_computer_science', 'extract_rate'],
        ['mmlu_pro_history', 'extract_rate'],
        '',
        ['cmmlu', 'extract_rate'],
        ['cmmlu-stem', 'extract_rate'],
        ['cmmlu-social-science', 'extract_rate'],
        ['cmmlu-humanities', 'extract_rate'],
        ['cmmlu-other', 'extract_rate'],
        ['cmmlu-china-specific', 'extract_rate'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)


#######################################################################
#                         PART 3  Models List                         #
#######################################################################

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
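
# Optional, illustrative sketch (not part of the original config): besides the
# commented-out imports in PART 0, a model can also be declared inline and
# appended here. The abbr/path below are placeholders and would need to be
# adapted to the actual checkpoint and hardware.
# from opencompass.models import HuggingFacewithChatTemplate
# models += [
#     dict(
#         type=HuggingFacewithChatTemplate,
#         abbr='my-chat-model-hf',             # hypothetical name
#         path='org/placeholder-chat-model',   # hypothetical HF path
#         max_out_len=2048,
#         batch_size=8,
#         run_cfg=dict(num_gpus=1),
#     ),
# ]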



#######################################################################
#               PART 4  Inference/Evaluation Configuration            #
#######################################################################

# Inference with the local runner: NumWorkerPartitioner splits each dataset
# into `num_worker` shards so they can be inferred in parallel.
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=8
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        retry=0,  # Modify if needed
        task=dict(type=OpenICLInferTask)
    ),
)

# eval with local runner
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLEvalTask)),
)


#######################################################################
#                      PART 5  Utils Configuration                    #
#######################################################################
base_exp_dir = 'outputs/corebench_v1_9/'
work_dir = osp.join(base_exp_dir, 'chat_objective')
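
# Usage sketch (the save path and filename are illustrative, assuming this
# file lives in an OpenCompass checkout as configs/eval_corebench_v1_9.py):
#   python run.py configs/eval_corebench_v1_9.py --debug
# Predictions, results, and summaries are written under `work_dir` above.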