# MBPP

```bash
python3 run.py --models hf_internlm2_7b --datasets sanitized_mbpp_gen_742f0c --debug
python3 run.py --models hf_internlm2_chat_7b --datasets sanitized_mbpp_mdblock_gen_a447ff --debug
```

## Base Models

| model                    |   pass@1 |   pass |   timeout |   failed |   wrong_answer |
|:------------------------:|---------:|-------:|----------:|---------:|---------------:|
| llama-7b-turbomind       |    25.29 |     65 |         8 |       62 |            122 |
| llama-13b-turbomind      |    29.96 |     77 |         4 |       74 |            102 |
| llama-30b-turbomind      |    37.35 |     96 |        17 |       39 |            105 |
| llama-65b-turbomind      |    45.53 |    117 |        10 |       35 |             95 |
| llama-2-7b-turbomind     |    26.46 |     68 |        18 |       49 |            122 |
| llama-2-13b-turbomind    |    36.58 |     94 |        17 |       45 |            101 |
| llama-2-70b-turbomind    |    49.42 |    127 |        12 |       32 |             86 |
| llama-3-8b-turbomind     |    54.86 |    141 |        11 |       22 |             83 |
| llama-3-70b-turbomind    |    77.82 |    200 |         0 |       10 |             47 |
| internlm2-1.8b-turbomind |    30.74 |     79 |        10 |       61 |            107 |
| internlm2-7b-turbomind   |    54.47 |    140 |        11 |       28 |             78 |
| internlm2-20b-turbomind  |    59.92 |    154 |         6 |       31 |             66 |
| qwen-1.8b-turbomind      |     2.72 |      7 |        16 |      222 |             12 |
| qwen-7b-turbomind        |    46.69 |    120 |        10 |       37 |             90 |
| qwen-14b-turbomind       |    55.64 |    143 |         0 |       31 |             83 |
| qwen-72b-turbomind       |    65.76 |    169 |         0 |       26 |             62 |
| qwen1.5-0.5b-hf          |     5.06 |     13 |        13 |      190 |             41 |
| qwen1.5-1.8b-hf          |    15.95 |     41 |        19 |      124 |             73 |
| qwen1.5-4b-hf            |    45.91 |    118 |         8 |       27 |            104 |
| qwen1.5-7b-hf            |    52.14 |    134 |        11 |       24 |             88 |
| qwen1.5-14b-hf           |    52.14 |    134 |        16 |       33 |             74 |
| qwen1.5-32b-hf           |    59.14 |    152 |         7 |       25 |             73 |
| qwen1.5-72b-hf           |    61.09 |    157 |         1 |       21 |             78 |
| qwen1.5-moe-a2-7b-hf     |    47.08 |    121 |         0 |       52 |             84 |
| mistral-7b-v0.1-hf       |    47.47 |    122 |         9 |       33 |             93 |
| mistral-7b-v0.2-hf       |    49.81 |    128 |         9 |       27 |             93 |
| mixtral-8x7b-v0.1-hf     |    62.65 |    161 |        10 |       13 |             73 |
| mixtral-8x22b-v0.1-hf    |    73.15 |    188 |         1 |       10 |             58 |
| yi-6b-hf                 |    30.35 |     78 |         8 |       40 |            131 |
| yi-34b-hf                |    48.64 |    125 |         0 |       43 |             89 |
| deepseek-7b-base-hf      |    43.97 |    113 |        11 |       34 |             99 |
| deepseek-67b-base-hf     |    64.98 |    167 |         0 |       24 |             66 |

## Chat Models

| model                         |   pass@1 |   pass |   timeout |   failed |   wrong_answer |
|:-----------------------------:|---------:|-------:|----------:|---------:|---------------:|
| qwen1.5-0.5b-chat-hf          |    11.28 |     29 |         1 |      129 |             98 |
| qwen1.5-1.8b-chat-hf          |    22.57 |     58 |         2 |       70 |            127 |
| qwen1.5-4b-chat-hf            |    43.58 |    112 |         1 |       33 |            111 |
| qwen1.5-7b-chat-hf            |    50.58 |    130 |         0 |       35 |             92 |
| qwen1.5-14b-chat-hf           |    56.03 |    144 |         0 |       24 |             89 |
| qwen1.5-32b-chat-hf           |    65.37 |    168 |         2 |       13 |             74 |
| qwen1.5-72b-chat-hf           |    66.93 |    172 |         0 |       17 |             68 |
| qwen1.5-110b-chat-hf          |    68.48 |    176 |         0 |       16 |             65 |
| internlm2-chat-1.8b-hf        |    39.69 |    102 |         0 |       48 |            107 |
| internlm2-chat-1.8b-sft-hf    |    36.19 |     93 |         1 |       58 |            105 |
| internlm2-chat-7b-hf          |    57.59 |    148 |         0 |       21 |             88 |
| internlm2-chat-7b-sft-hf      |    55.64 |    143 |         2 |       22 |             90 |
| internlm2-chat-20b-hf         |    68.87 |    177 |         0 |       16 |             64 |
| internlm2-chat-20b-sft-hf     |    69.65 |    179 |         0 |       16 |             62 |
| llama-3-8b-instruct-hf        |    68.87 |    177 |         0 |        8 |             72 |
| llama-3-70b-instruct-hf       |    79.77 |    205 |         0 |        2 |             50 |
| llama-3-8b-instruct-lmdeploy  |    66.93 |    172 |         0 |        7 |             78 |
| llama-3-70b-instruct-lmdeploy |    77.82 |    200 |         1 |        2 |             54 |
| mistral-7b-instruct-v0.1-hf   |    47.86 |    123 |         0 |       29 |            105 |
| mistral-7b-instruct-v0.2-hf   |    45.91 |    118 |         0 |       31 |            108 |
| mixtral-8x7b-instruct-v0.1-hf |    61.48 |    158 |         1 |       13 |             85 |