diff --git a/.github/scripts/eval_regression_api.py b/.github/scripts/eval_regression_api.py
index 29ec54061..ba1902a9c 100644
--- a/.github/scripts/eval_regression_api.py
+++ b/.github/scripts/eval_regression_api.py
@@ -24,7 +24,7 @@
         abbr='lmdeploy-api-test',
         type=OpenAISDK,
         key='EMPTY',
-        openai_api_base='http://localhost:23333/v1',
+        openai_api_base='http://0.0.0.0:23333/v1',
         path='internlm2',
         tokenizer_path='internlm/internlm2_5-7b-chat',
         rpm_verbose=True,
diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml
index 5b0dee2ba..9f171a02d 100644
--- a/.github/scripts/oc_score_baseline_fullbench.yaml
+++ b/.github/scripts/oc_score_baseline_fullbench.yaml
@@ -42,7 +42,7 @@ internlm2_5-7b-chat-hf_fullbench:
   alpaca_eval_total: 20
   arenahard_score: 50
   Followbench_naive_average: 1
-  CompassArena_naive_average: 44.00
+  CompassArena_naive_average: 43
   mtbench101_avg: 7.8
   wildbench_average: -12.78
   simpleqa_accuracy_given_attempted: 0
@@ -58,7 +58,7 @@ internlm2_5-7b-chat-hf_fullbench:
   alpaca_eval_helpful_base: 20
   compassarena_language_naive_average: 35
   compassarena_knowledge_naive_average: 55
-  compassarena_reason_v2_naive_average: 45.00
+  compassarena_reason_v2_naive_average: 40
   compassarena_math_v2_naive_average: 55
   compassarena_creationv2_zh_naive_average: 30
   followbench_llmeval_en_HSR_AVG: 1
diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml
index 5f1121a74..45f741316 100644
--- a/.github/scripts/oc_score_baseline_testrange.yaml
+++ b/.github/scripts/oc_score_baseline_testrange.yaml
@@ -6,7 +6,7 @@ chat:
     gsm8k_accuracy: 71.88
     race-high_accuracy: 90.62
   glm-4-9b-chat-vllm:
-    gsm8k_accuracy: 65.62
+    gsm8k_accuracy: 71.88
     race-high_accuracy: 90.62
   deepseek-7b-chat-hf:
     gsm8k_accuracy: 46.88
@@ -63,7 +63,7 @@ chat:
     gsm8k_accuracy: 84.38
     race-high_accuracy: 90.62
   llama-3_2-3b-instruct-hf:
-    gsm8k_accuracy: 65.62
+    gsm8k_accuracy: 68.75
     race-high_accuracy: 81.25
   llama-3-8b-instruct-hf:
     gsm8k_accuracy: 68.75
@@ -75,7 +75,7 @@ chat:
     gsm8k_accuracy: 78.12
     race-high_accuracy: 90.62
   llama-3_2-3b-instruct-turbomind:
-    gsm8k_accuracy: 62.50
+    gsm8k_accuracy: 65.62
     race-high_accuracy: 81.25
   llama-3-8b-instruct-turbomind:
     gsm8k_accuracy: 71.88
@@ -226,25 +226,25 @@ base:
     race-high_accuracy: 25
     winogrande_accuracy: 68.75
   gemma2-2b-hf:
-    gsm8k_accuracy: 28.12
+    gsm8k_accuracy: 31.25
     GPQA_diamond_accuracy: 3.12
     race-high_accuracy: 56.25
-    winogrande_accuracy: 71.88
+    winogrande_accuracy: 75.00
   gemma2-9b-hf:
-    gsm8k_accuracy: 68.75
+    gsm8k_accuracy: 75.00
     GPQA_diamond_accuracy: 0
-    race-high_accuracy: 81.25
-    winogrande_accuracy: 84.38
+    race-high_accuracy: 84.38
+    winogrande_accuracy: 81.25
   gemma-2b-hf:
-    gsm8k_accuracy: 18.75
+    gsm8k_accuracy: 21.88
     GPQA_diamond_accuracy: 3.12
-    race-high_accuracy: 25
+    race-high_accuracy: 21.88
     winogrande_accuracy: 53.12
   gemma-7b-hf:
     gsm8k_accuracy: 56.25
-    GPQA_diamond_accuracy: 6.25
+    GPQA_diamond_accuracy: 3.12
     race-high_accuracy: 65.62
-    winogrande_accuracy: 78.12
+    winogrande_accuracy: 71.88
   gemma-2b-vllm:
     gsm8k_accuracy: 15.62
     GPQA_diamond_accuracy: 3.12
@@ -441,10 +441,10 @@ base:
     race-high_accuracy: 93.75
     winogrande_accuracy: 87.5
   deepseek-v2-turbomind:
-    gsm8k_accuracy: 71.88
-    GPQA_diamond_accuracy: 3.12
-    race-high_accuracy: 81.25
-    winogrande_accuracy: 75
+    gsm8k_accuracy: 65.62
+    GPQA_diamond_accuracy: 15.62
+    race-high_accuracy: 93.75
+    winogrande_accuracy: 84.38
   llama-3-70b-hf:
     gsm8k_accuracy: 62.5
     GPQA_diamond_accuracy: 3.12
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
index 8aa1df16a..3cdb3a732 100644
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@@ -44,7 +44,7 @@ on:
         type: string
         default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
   schedule:
-    - cron: '15 14 * * *'
+    - cron: '15 14 * * 0,2'
 
 env:
   HF_DATASETS_OFFLINE: 1
@@ -87,7 +87,7 @@ jobs:
           name: my-artifact-${{ github.run_id }}
 
   build-pypi-lmdeploy:
-    if: ${{!cancelled() && (github.event_name != 'schedule' && inputs.build_lmdeploy)}}
+    if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.build_lmdeploy)}}
     strategy:
       matrix:
        pyver: [py310]
@@ -127,7 +127,7 @@ jobs:
     needs: ['build-pypi', 'build-pypi-lmdeploy']
     runs-on: volc_cu12
     environment: 'prod'
-    timeout-minutes: 240 #4hours
+    timeout-minutes: 120 #2hours
     steps:
       - name: Clone repository
         uses: actions/checkout@v2
@@ -148,7 +148,7 @@ jobs:
         uses: nick-fields/retry@v3
         with:
           max_attempts: 1
-          timeout_minutes: 240
+          timeout_minutes: 120
           command: |
             . ${{env.CONDA_PATH}}/bin/activate
             conda create -y --name ${{env.CONDA_ENV}} python=3.10
@@ -211,7 +211,7 @@ jobs:
         uses: nick-fields/retry@v3
         with:
           max_attempts: 1
-          timeout_minutes: 120
+          timeout_minutes: 180
           command: |
             . ${{env.CONDA_PATH}}/bin/activate
             conda activate ${{env.CONDA_ENV}}
@@ -230,7 +230,7 @@ jobs:
         regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
     runs-on: volc_cu12_local
     environment: 'prod'
-    timeout-minutes: 240 #4hours
+    timeout-minutes: 480 #8hours
     steps:
       - name: Clone repository
         uses: actions/checkout@v2
@@ -306,7 +306,7 @@ jobs:
         function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
     runs-on: volc_cu12
     environment: 'prod'
-    timeout-minutes: 360 #6hours
+    timeout-minutes: 480 #8hours
     steps:
       - name: Clone repository
         uses: actions/checkout@v2
@@ -323,7 +323,7 @@ jobs:
         uses: nick-fields/retry@v3
         with:
           max_attempts: 1
-          timeout_minutes: 360
+          timeout_minutes: 480
           command: |
             . ${{env.CONDA_PATH}}/bin/activate
             conda activate ${{env.CONDA_ENV}}