# MBPP

```bash
python3 run.py --models hf_internlm2_7b --datasets sanitized_mbpp_gen_742f0c --debug
python3 run.py --models hf_internlm2_chat_7b --datasets sanitized_mbpp_mdblock_gen_a447ff --debug
```

## Base Models

| model                    |   pass@1 |   pass |   timeout |   failed |   wrong_answer |
|:------------------------:|---------:|-------:|----------:|---------:|---------------:|
| llama-7b-turbomind       |    25.29 |     65 |         8 |       62 |            122 |
| llama-13b-turbomind      |    29.96 |     77 |         4 |       74 |            102 |
| llama-30b-turbomind      |    37.35 |     96 |        17 |       39 |            105 |
| llama-65b-turbomind      |    45.53 |    117 |        10 |       35 |             95 |
| llama-2-7b-turbomind     |    26.46 |     68 |        18 |       49 |            122 |
| llama-2-13b-turbomind    |    36.58 |     94 |        17 |       45 |            101 |
| llama-2-70b-turbomind    |    49.42 |    127 |        12 |       32 |             86 |
| llama-3-8b-turbomind     |    54.86 |    141 |        11 |       22 |             83 |
| llama-3-70b-turbomind    |    77.82 |    200 |         0 |       10 |             47 |
| internlm2-1.8b-turbomind |    30.74 |     79 |        10 |       61 |            107 |
| internlm2-7b-turbomind   |    54.47 |    140 |        11 |       28 |             78 |
| internlm2-20b-turbomind  |    59.92 |    154 |         6 |       31 |             66 |
| qwen-1.8b-turbomind      |     2.72 |      7 |        16 |      222 |             12 |
| qwen-7b-turbomind        |    46.69 |    120 |        10 |       37 |             90 |
| qwen-14b-turbomind       |    55.64 |    143 |         0 |       31 |             83 |
| qwen-72b-turbomind       |    65.76 |    169 |         0 |       26 |             62 |
| qwen1.5-0.5b-hf          |     5.06 |     13 |        13 |      190 |             41 |
| qwen1.5-1.8b-hf          |    15.95 |     41 |        19 |      124 |             73 |
| qwen1.5-4b-hf            |    45.91 |    118 |         8 |       27 |            104 |
| qwen1.5-7b-hf            |    52.14 |    134 |        11 |       24 |             88 |
| qwen1.5-14b-hf           |    52.14 |    134 |        16 |       33 |             74 |
| qwen1.5-32b-hf           |    59.14 |    152 |         7 |       25 |             73 |
| qwen1.5-72b-hf           |    61.09 |    157 |         1 |       21 |             78 |
| qwen1.5-moe-a2-7b-hf     |    47.08 |    121 |         0 |       52 |             84 |
| mistral-7b-v0.1-hf       |    47.47 |    122 |         9 |       33 |             93 |
| mistral-7b-v0.2-hf       |    49.81 |    128 |         9 |       27 |             93 |
| mixtral-8x7b-v0.1-hf     |    62.65 |    161 |        10 |       13 |             73 |
| mixtral-8x22b-v0.1-hf    |    73.15 |    188 |         1 |       10 |             58 |
| yi-6b-hf                 |    30.35 |     78 |         8 |       40 |            131 |
| yi-34b-hf                |    48.64 |    125 |         0 |       43 |             89 |
| deepseek-7b-base-hf      |    43.97 |    113 |        11 |       34 |             99 |
| deepseek-67b-base-hf     |    64.98 |    167 |         0 |       24 |             66 |

## Chat Models

| model                         |   pass@1 |   pass |   timeout |   failed |   wrong_answer |
|:-----------------------------:|---------:|-------:|----------:|---------:|---------------:|
| qwen1.5-0.5b-chat-hf          |    11.28 |     29 |         1 |      129 |             98 |
| qwen1.5-1.8b-chat-hf          |    22.57 |     58 |         2 |       70 |            127 |
| qwen1.5-4b-chat-hf            |    43.58 |    112 |         1 |       33 |            111 |
| qwen1.5-7b-chat-hf            |    50.58 |    130 |         0 |       35 |             92 |
| qwen1.5-14b-chat-hf           |    56.03 |    144 |         0 |       24 |             89 |
| qwen1.5-32b-chat-hf           |    65.37 |    168 |         2 |       13 |             74 |
| qwen1.5-72b-chat-hf           |    66.93 |    172 |         0 |       17 |             68 |
| qwen1.5-110b-chat-hf          |    68.48 |    176 |         0 |       16 |             65 |
| internlm2-chat-1.8b-hf        |    39.69 |    102 |         0 |       48 |            107 |
| internlm2-chat-1.8b-sft-hf    |    36.19 |     93 |         1 |       58 |            105 |
| internlm2-chat-7b-hf          |    57.59 |    148 |         0 |       21 |             88 |
| internlm2-chat-7b-sft-hf      |    55.64 |    143 |         2 |       22 |             90 |
| internlm2-chat-20b-hf         |    68.87 |    177 |         0 |       16 |             64 |
| internlm2-chat-20b-sft-hf     |    69.65 |    179 |         0 |       16 |             62 |
| llama-3-8b-instruct-hf        |    68.87 |    177 |         0 |        8 |             72 |
| llama-3-70b-instruct-hf       |    79.77 |    205 |         0 |        2 |             50 |
| llama-3-8b-instruct-lmdeploy  |    66.93 |    172 |         0 |        7 |             78 |
| llama-3-70b-instruct-lmdeploy |    77.82 |    200 |         1 |        2 |             54 |
| mistral-7b-instruct-v0.1-hf   |    47.86 |    123 |         0 |       29 |            105 |
| mistral-7b-instruct-v0.2-hf   |    45.91 |    118 |         0 |       31 |            108 |
| mixtral-8x7b-instruct-v0.1-hf |    61.48 |    158 |         1 |       13 |             85 |