 
 output_path = 'regression_result_daily'
 
-model_list = ['internlm2-7b-hf', 'internlm-chat-7b-hf', 'chatglm3-6b-base-hf']
-dataset_list = [
-    'ARC-c', 'chid-dev', 'chid-test', 'openai_humaneval', 'openbookqa',
-    'openbookqa_fact'
+chat_model_list = [
+    'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
+    'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2-chat-1.8b-turbomind',
+    'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind',
+    'internlm2-chat-7b-sft-turbomind', 'llama-3-8b-instruct-hf',
+    'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf',
+    'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
+    'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
+    'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-turbomind',
+    'qwen2-7b-instruct-turbomind', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf'
 ]
+base_model_list = [
+    'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', 'gemma-2b-hf',
+    'gemma-7b-hf', 'internlm2-1.8b-turbomind', 'internlm2-7b-turbomind',
+    'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind',
+    'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
+    'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind', 'yi-1.5-6b-hf',
+    'yi-1.5-9b-hf'
+]
+dataset_list = ['gsm8k', 'race-middle', 'race-high']
 
 
 @pytest.fixture()
@@ -32,10 +47,28 @@ def result_scores():
 
 @pytest.mark.usefixtures('result_scores')
 @pytest.mark.usefixtures('baseline_scores')
+@pytest.mark.chat
 class TestChat:
     """Test cases for chat model."""
 
-    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in model_list
+    @pytest.mark.parametrize('model, dataset', [(p1, p2)
+                                                for p1 in chat_model_list
+                                                for p2 in dataset_list])
+    def test_model_dataset_score(self, baseline_scores, result_scores, model,
+                                 dataset):
+        base_score = baseline_scores.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(result_score, base_score)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores')
+@pytest.mark.base
+class TestBase:
+    """Test cases for base model."""
+
+    @pytest.mark.parametrize('model, dataset', [(p1, p2)
+                                                for p1 in base_model_list
                                                 for p2 in dataset_list])
     def test_model_dataset_score(self, baseline_scores, result_scores, model,
                                  dataset):
@@ -47,13 +80,13 @@ def test_model_dataset_score(self, baseline_scores, result_scores, model,
 def assert_score(score, baseline):
     if score is None or score == '-':
         assert False, 'value is none'
-    if float(score) < (baseline * 1.03) and float(score) > (baseline * 0.97):
-        print(score + ' between ' + str(baseline * 0.97) + ' and ' +
-              str(baseline * 1.03))
+    if float(score) <= (baseline + 5) and float(score) >= (baseline - 5):
+        print(score + ' between ' + str(baseline - 5) + ' and ' +
+              str(baseline + 5))
         assert True
     else:
         assert False, score + ' not between ' + str(
-            baseline * 0.97) + ' and ' + str(baseline * 1.03)
+            baseline - 5) + ' and ' + str(baseline + 5)
 
 
 def find_csv_files(directory):
@@ -62,11 +95,11 @@ def find_csv_files(directory):
         for file in files:
             if file.endswith('.csv'):
                 csv_files.append(os.path.join(root, file))
-    if len(csv_files) > 1:
-        raise 'have more than 1 result file, please check the result manually'
-    if len(csv_files) == 0:
-        return None
-    return csv_files[0]
+
+    csv_files_with_time = {f: os.path.getctime(f) for f in csv_files}
+    sorted_csv_files = sorted(csv_files_with_time.items(), key=lambda x: x[1])
+    latest_csv_file = sorted_csv_files[-1][0]
+    return latest_csv_file
 
 
 def read_csv_file(file_path):
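
For reference, a minimal sketch of how the `chat` / `base` markers introduced in this diff could be used to run each suite on its own. The script name `oc_score_assert.py` and the marker registration are assumptions for illustration, not part of the diff itself:

    # Sketch only: assumes the diff above is applied to oc_score_assert.py and
    # that the 'chat' and 'base' markers are registered in the pytest config.
    import pytest

    # Run only the chat-model regression cases
    # (equivalent to `pytest -m chat oc_score_assert.py` on the command line).
    pytest.main(['-m', 'chat', 'oc_score_assert.py'])

    # Run only the base-model regression cases.
    pytest.main(['-m', 'base', 'oc_score_assert.py'])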