open-compass · Nov 28, 2023 · Nov 29, 2023 · Nov 30, 2023 · Nov 30, 2023 · Dec 1, 2023
diff --git a/.github/workflows/pr-stage-check.yml b/.github/workflows/pr-stage-check.yml
@@ -0,0 +1,121 @@
+# PR-stage smoke test. Each job below installs opencompass, downloads the
+# core dataset archive and invokes `run.py` with `--dry-run`.
+name: pr_stage_test
+
+on:
+  pull_request:
+    # A PR whose changes are confined to these paths does not trigger the
+    # workflow.
+    paths-ignore:
+      - 'README.md'
+      - 'README_zh-CN.md'
+      - 'docs/**'
+      - 'configs/**'
+      - 'tools/**'
+
+# Runs are grouped per workflow and ref; a newer run in the same group
+# cancels the one still in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # CPU-only smoke test on a GitHub-hosted Ubuntu runner: install PyTorch
+  # (CPU wheel), the system and Python dependencies and the package itself,
+  # fetch the core dataset archive, then call run.py with --dry-run.
+  build:
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        python-version: ['3.10']
+        include:
+          # Quoted so the version is always handled as a string.
+          - torch: '2.0.0'
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      # pip is upgraded once, before anything is installed with it.
+      - name: Upgrade pip
+        run: python -m pip install --upgrade pip
+      - name: Install PyTorch
+        run: pip install torch==${{matrix.torch}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
+      # The sed call appends one extra apt source as the last line of
+      # sources.list before the package index is refreshed.
+      - name: Install system dependencies
+        run: |
+          sudo sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list
+          sudo apt-get update && sudo apt-get install -y libc6 libffi-dev libncursesw6 wget unzip
+      - name: Install opencompass dependencies
+        run: |
+          python -m pip install -r requirements.txt
+      - name: Build and install
+        run: python -m pip install -e .
+      # The archive is downloaded and unpacked in the workspace root.
+      - name: Prepare dataset
+        run: |
+          wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
+          unzip OpenCompassData-core-20231110.zip
+      - name: Dry run test
+        run: |
+          python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run
+
+  # Same smoke test, executed inside the pytorch/pytorch CUDA 11.7 devel
+  # container. There is no PyTorch install step in this job; presumably the
+  # framework is provided by the image (confirm if the image tag changes).
+  build_cu117:
+    runs-on: ubuntu-22.04
+    container:
+      image: pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel
+    strategy:
+      matrix:
+        python-version: ['3.10']
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      # Registers the NVIDIA repository signing keys with apt before any
+      # `apt-get update` is run.
+      - name: Fetch GPG keys
+        run: |
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+      # Compare as strings: an unquoted 3.10 is the number 3.1 and would make
+      # the expression coerce the matrix value to a number as well.
+      - name: Install Python-dev
+        run: apt-get update && apt-get install -y python${{matrix.python-version}}-dev
+        if: ${{ matrix.python-version != '3.10' }}
+      # The sed call appends one extra apt source as the last line of
+      # sources.list; the index is refreshed again afterwards.
+      - name: Install system dependencies
+        run: |
+          apt-get update
+          apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev libc6 libc6-dev
+          sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list
+          apt-get update && apt-get install -y libc6 libffi-dev libncursesw6 wget unzip
+      - name: Upgrade pip
+        run: python -m pip install pip --upgrade
+      - name: Install opencompass dependencies
+        run: |
+          python -m pip install -r requirements.txt
+      - name: Build and install
+        run: python -m pip install -e .
+      # The archive is downloaded and unpacked in the workspace root.
+      - name: Prepare dataset
+        run: |
+          wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
+          unzip OpenCompassData-core-20231110.zip
+      - name: Dry run test
+        run: |
+          python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run
+
+  # Windows variant of the smoke test: CPU PyTorch wheel, Python
+  # dependencies, editable install, dataset download, then a dry run.
+  build_windows:
+    runs-on: windows-2022
+    strategy:
+      matrix:
+        python-version:
+          - '3.10'
+        platform:
+          - cpu
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Upgrade pip
+        run: python -m pip install pip --upgrade
+      # The wheel variant and the index URL both follow matrix.platform.
+      - name: Install PyTorch
+        run: pip install torch==2.0.0+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html
+      - name: Install opencompass dependencies
+        run: pip install -r requirements.txt
+      - name: Build and install
+        run: pip install -e .
+      # Invoke-WebRequest takes the place of the wget call used by the
+      # Linux jobs.
+      - name: Prepare dataset
+        run: |
+          Invoke-WebRequest -Uri https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip -OutFile OpenCompassData-core-20231110.zip
+          unzip OpenCompassData-core-20231110.zip
+      - name: Dry run test
+        run: python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run
diff --git a/.gitignore b/.gitignore
@@ -11,6 +11,7 @@ configs/eval_debug*.py
 configs/viz_*.py
 data
 work_dirs
+models
 configs/internal/
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/.pre-commit-config-zh-cn.yaml b/.pre-commit-config-zh-cn.yaml
@@ -5,7 +5,8 @@ exclude: |
       opencompass/utils/internal/|
       opencompass/openicl/icl_evaluator/hf_metrics/|
       opencompass/datasets/lawbench/utils|
-      opencompass/datasets/lawbench/evaluation_functions/
+      opencompass/datasets/lawbench/evaluation_functions/|
+      opencompass/datasets/medbench/
     )
 repos:
   - repo: https://gitee.com/openmmlab/mirrors-flake8

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -5,7 +5,8 @@ exclude: |
       opencompass/utils/internal/|
       opencompass/openicl/icl_evaluator/hf_metrics/|
       opencompass/datasets/lawbench/utils|
-      opencompass/datasets/lawbench/evaluation_functions/
+      opencompass/datasets/lawbench/evaluation_functions/|
+      opencompass/datasets/medbench/
     )
 repos:
   - repo: https://github.com/PyCQA/flake8

diff --git a/README.md b/README.md
@@ -50,6 +50,8 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
 
 ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
 
+- **\[2023.12.10\]** We have released [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), a toolkit for evaluating vision-language models (VLMs) that currently supports 20+ VLMs and 7 multi-modal benchmarks (including the MMBench series). 🔥🔥🔥.
+- **\[2023.12.10\]** We have supported Mistral AI's MoE LLM: **Mixtral-8x7B-32K**. Welcome to [MixtralKit](https://github.com/open-compass/MixtralKit) for more details about inference and evaluation. 🔥🔥🔥.
 - **\[2023.11.22\]** We have supported many API-based models, including **Baidu, ByteDance, Huawei, 360**. Welcome to [Models](https://opencompass.readthedocs.io/en/latest/user_guides/models.html) section for more details. 🔥🔥🔥.
 - **\[2023.11.20\]** Thanks [helloyongyang](https://github.com/helloyongyang) for supporting the evaluation with [LightLLM](https://github.com/ModelTC/lightllm) as the backend. Welcome to [Evaluation With LightLLM](https://opencompass.readthedocs.io/en/latest/advanced_guides/evaluation_lightllm.html) for more details. 🔥🔥🔥.
 - **\[2023.11.13\]** We are delighted to announce the release of OpenCompass v0.1.8. This version enables local loading of evaluation benchmarks, thereby eliminating the need for an internet connection. Please note that with this update, **you must re-download all evaluation datasets** to ensure accurate and up-to-date results.🔥🔥🔥.

diff --git a/README_zh-CN.md b/README_zh-CN.md
@@ -50,6 +50,8 @@
 
 ## 🚀 最新进展 <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
 
+- **\[2023.12.10\]** 我们开源了多模评测框架 [VLMEvalKit](https://github.com/open-compass/VLMEvalKit)，目前已支持 20+ 个多模态大模型与包括 MMBench 系列在内的 7 个多模态评测集。🔥🔥🔥。
+- **\[2023.12.10\]** 我们已经支持了Mistral AI的MoE模型 **Mixtral-8x7B-32K**。欢迎查阅[MixtralKit](https://github.com/open-compass/MixtralKit)以获取更多关于推理和评测的详细信息。🔥🔥🔥。
 - **\[2023.11.22\]** 我们已经支持了多个基于API的模型，包括**百度、字节跳动、华为、360**。欢迎查阅[模型](https://opencompass.readthedocs.io/en/latest/user_guides/models.html)部分以获取更多详细信息。🔥🔥🔥。
 - **\[2023.11.20\]** 感谢[helloyongyang](https://github.com/helloyongyang)支持使用[LightLLM](https://github.com/ModelTC/lightllm)作为后端进行评估。欢迎查阅[使用LightLLM进行评估](https://opencompass.readthedocs.io/en/latest/advanced_guides/evaluation_lightllm.html)以获取更多详细信息。🔥🔥🔥。
 - **\[2023.11.13\]** 我们很高兴地宣布发布 OpenCompass v0.1.8 版本。此版本支持本地加载评估基准，从而无需连接互联网。请注意，随着此更新的发布，**您需要重新下载所有评估数据集**，以确保结果准确且最新。🔥🔥🔥。

diff --git a/configs/api_examples/eval_api_360.py b/configs/api_examples/eval_api_360.py
@@ -18,6 +18,13 @@
         type=AI360GPT,
         path='360GPT_S2_V9',
         key="xxxxxxxxxxxx",
+        generation_kwargs={
+            'temperature': 0.9,
+            'max_tokens': 2048,
+            'top_p': 0.5,
+            'tok_k': 0,
+            'repetition_penalty': 1.05,
+        },
         query_per_second=1,
         max_out_len=2048,
         max_seq_len=2048,

diff --git a/configs/api_examples/eval_api_baichuan.py b/configs/api_examples/eval_api_baichuan.py
@@ -20,6 +20,12 @@
         api_key='xxxxxx',
         secret_key="xxxxx",
         url="xxxxx",
+        generation_kwargs={
+            'temperature': 0.3,
+            'top_p': 0.85,
+            'top_k': 5,
+            'with_search_enhance': False,
+        },
         query_per_second=1,
         max_out_len=2048,
         max_seq_len=2048,

diff --git a/configs/api_examples/eval_api_baidu.py b/configs/api_examples/eval_api_baidu.py
@@ -20,10 +20,14 @@
         key='xxxxxx',  # please give your key
         secretkey='xxxxxxxxx',  # please give your secret key
         url='xxxxxxxxx',
+        generation_kwargs = {
+            'temperature': 0.8,
+        },
         query_per_second=1,
         max_out_len=2048,
         max_seq_len=2048,
-        batch_size=8),
+        batch_size=8
+    ),
 ]
 
 infer = dict(

diff --git a/configs/api_examples/eval_api_bytedance.py b/configs/api_examples/eval_api_bytedance.py
@@ -21,6 +21,11 @@
         accesskey="xxxxxxx",
         secretkey="xxxxxxx",
         url='xxxxxx',
+        generation_kwargs={
+            'temperature': 0.7,
+            'top_p': 0.9,
+            'top_k': 0,
+        },
         query_per_second=1,
         max_out_len=2048,
         max_seq_len=2048,

diff --git a/configs/api_examples/eval_api_moonshot.py b/configs/api_examples/eval_api_moonshot.py
@@ -19,6 +19,9 @@
         path='moonshot-v1-32k',
         key='xxxxxxx',
         url= 'xxxxxxxx',
+        system_prompt= '你是 Kimi，由 Moonshot AI 提供的人工智能助手，你更擅长中文和英文的对话。'
+        '你会为用户提供安全，有帮助，准确的回答。同时，你会拒绝一些涉及恐怖主义，种族歧视，'
+        '黄色暴力等问题的回答。Moonshot AI 为专有名词，不可翻译成其他语言。',
         query_per_second=1,
         max_out_len=2048,
         max_seq_len=2048,

diff --git a/configs/datasets/CIBench/CIBench_gen.py b/configs/datasets/CIBench/CIBench_gen.py
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .CIBench_gen_eb42f9 import ci_datasets  # noqa: F401, F403
+    from .CIBench_gen_8ab0dc import ci_datasets  # noqa: F401, F403
diff --git a/...gs/datasets/CIBench/CIBench_gen_eb42f9.py → ...gs/datasets/CIBench/CIBench_gen_8ab0dc.py b/...gs/datasets/CIBench/CIBench_gen_eb42f9.py → ...gs/datasets/CIBench/CIBench_gen_8ab0dc.py
@@ -16,28 +16,20 @@
         template="""{questions}""",
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=AgentInferencer),
+    inferencer=dict(type=AgentInferencer, infer_mode='every'),
 )
 
 
 libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
-cibench_eval_cfg = {
-    lib: dict(
-        evaluator=dict(
-            type=CIBenchEvaluator,
-            output_dir=f'output_data/cibench/{lib}'),
-        pred_role="BOT",
-    )
-    for lib in libs
-}
+cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")
 
 cibench_datasets = [
     dict(
-        abbr=f"cibench_{lib}",
+        abbr=f"cibench_generation_{lib}",
         type=CIBenchDataset,
         path=f"./data/cibench/{lib}",
         reader_cfg=cibench_reader_cfg,
         infer_cfg=cibench_infer_cfg,
-        eval_cfg=cibench_eval_cfg[lib],
+        eval_cfg=cibench_eval_cfg,
     ) for lib in libs
 ]