From 6b7de1a030e5bf8c32eb66a03a0fc70bb3c2da4a Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Fri, 26 Jan 2024 15:41:10 -0500 Subject: [PATCH 001/112] [ROCm] add support to ROCm 6.0 and MI300 (#2274) --- Dockerfile.rocm | 36 ++++++++++++++++--- README.md | 3 +- csrc/cuda_utils.h | 3 ++ csrc/cuda_utils_kernels.cu | 18 ++++++++++ csrc/pybind.cpp | 6 ++++ .../getting_started/amd-installation.rst | 33 +++++++++++++++-- setup.py | 2 ++ vllm/utils.py | 8 ++--- 8 files changed, 96 insertions(+), 13 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 36a7ee37fd228..88172fb73b937 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,4 +1,24 @@ -FROM rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1 +# default base image +ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" + +FROM $BASE_IMAGE + +ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" + +RUN echo "Base image is $BASE_IMAGE" + +# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" +# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" + +# this does not always work for all rocm versions +RUN LLVM_GFX_ARCH=$(/opt/rocm/llvm/bin/amdgpu-offload-arch) && \ + echo "LLVM_GFX_ARCH is $LLVM_GFX_ARCH" + +ARG FA_GFX_ARCHS="gfx90a;gfx942" +RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" + +ARG FA_BRANCH="3d2b6f5" +RUN echo "FA_BRANCH is $FA_BRANCH" # Install some basic utilities RUN apt-get update && apt-get install python3 python3-pip -y @@ -37,17 +57,23 @@ RUN mkdir libs \ && cd libs \ && git clone https://github.com/ROCmSoftwarePlatform/flash-attention.git \ && cd flash-attention \ - && git checkout 3d2b6f5 \ + && git checkout ${FA_BRANCH} \ && git submodule update --init \ - && export GPU_ARCHS=$(/opt/rocm/llvm/bin/amdgpu-offload-arch) \ - && patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch \ + && export GPU_ARCHS=${FA_GFX_ARCHS} \ + && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \ + patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ && python3 setup.py install \ && cd .. COPY ./ /app/vllm RUN python3 -m pip install --upgrade pip -RUN pip install xformers==0.0.23 --no-deps +RUN python3 -m pip install xformers==0.0.23 --no-deps + +# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. +# Manually removed it so that later steps of numpy upgrade can continue +RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ + rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi RUN cd /app \ && cd vllm \ diff --git a/README.md b/README.md index c7ae85e7973db..c0d267b2cbbf3 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,8 @@ Please register [here](https://lu.ma/ygxbpzhl) and join us! --- *Latest News* 🔥 -- [2023/12] Added ROCm support to vLLM. +- [2024/01] Added ROCm 6.0 support to vLLM. +- [2023/12] Added ROCm 5.7 support to vLLM. - [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing). - [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! 
We will also post the latest announcements and updates there. - [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv! diff --git a/csrc/cuda_utils.h b/csrc/cuda_utils.h index 69c96cef0d17e..1483484faeb4a 100644 --- a/csrc/cuda_utils.h +++ b/csrc/cuda_utils.h @@ -5,3 +5,6 @@ int get_device_attribute( int attribute, int device_id); + +int get_max_shared_memory_per_block_device_attribute( + int device_id); diff --git a/csrc/cuda_utils_kernels.cu b/csrc/cuda_utils_kernels.cu index 6c844a7f6c6ed..1a443ef3620cc 100644 --- a/csrc/cuda_utils_kernels.cu +++ b/csrc/cuda_utils_kernels.cu @@ -1,5 +1,6 @@ #ifdef USE_ROCM #include + #include #endif int get_device_attribute( int attribute, @@ -15,3 +16,20 @@ int get_device_attribute( cudaDeviceGetAttribute(&value, static_cast(attribute), device); return value; } + + +int get_max_shared_memory_per_block_device_attribute( + int device_id) +{ +int attribute; +// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html +// cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 + +#ifdef USE_ROCM + attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; +#else + attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin; +#endif + + return get_device_attribute(attribute, device_id); +} diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 95f557686f337..e6683c446154d 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -81,4 +81,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "get_device_attribute", &get_device_attribute, "Gets the specified device attribute."); + + cuda_utils.def( + "get_max_shared_memory_per_block_device_attribute", + &get_max_shared_memory_per_block_device_attribute, + "Gets the maximum shared memory per block device attribute."); + } diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 181c970e0b2a7..6851ba136351c 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -11,10 +11,10 @@ Requirements ------------ * OS: Linux -* Python: 3.8 -- 3.11 (Verified on 3.10) -* GPU: MI200s +* Python: 3.8 -- 3.11 +* GPU: MI200s (gfx90a), MI300 (gfx942) * Pytorch 2.0.1/2.1.1/2.2 -* ROCm 5.7 +* ROCm 5.7 (Verified on python 3.10) or ROCm 6.0 (Verified on python 3.9) Installation options: @@ -27,6 +27,8 @@ Installation options: (Recommended) Option 1: Quick start with vLLM pre-installed in Docker Image --------------------------------------------------------------------------- +This option is for ROCm 5.7 only: + .. code-block:: console $ docker pull embeddedllminfo/vllm-rocm:vllm-v0.2.4 @@ -50,6 +52,9 @@ Option 2: Build from source You can build and install vLLM from source: +Below instruction is for ROCm 5.7 only. +At the time of this documentation update, PyTorch on ROCm 6.0 wheel is not yet available on the PyTorch website. + 0. Install prerequisites (skip if you are already in an environment/docker with the following installed): - `ROCm `_ @@ -95,6 +100,23 @@ You can build and install vLLM from source: Build a docker image from `Dockerfile.rocm`, and launch a docker container. +The `Dokerfile.rocm` is designed to support both ROCm 5.7 and ROCm 6.0 and later versions. It provides flexibility to customize the build of docker image using the following arguments: + +* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. 
The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1` +* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` +* `FA_BRANCH`: specifies the branch used to build the flash-attention in `ROCmSoftwarePlatform's flash-attention repo `_. The default is `3d2b6f5` + +Their values can be passed in when running ``docker build`` with ``--build-arg`` options. + +For example, to build docker image for vllm on ROCm 5.7, you can run: + +.. code-block:: console + + $ docker build --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \ + -f Dockerfile.rocm -t vllm-rocm . + +To build vllm on ROCm 6.0, you can use the default: + .. code-block:: console $ docker build -f Dockerfile.rocm -t vllm-rocm . @@ -142,3 +164,8 @@ Alternatively, if you plan to install vLLM-ROCm on a local machine or start from $ cd vllm $ pip install -U -r requirements-rocm.txt $ python setup.py install # This may take 5-10 minutes. + +.. note:: + + - You may need to turn on the ``--enforce-eager`` flag if you experience process hang when running the `benchmark_thoughput.py` script to test your installation. + diff --git a/setup.py b/setup.py index 3baf27aa86532..5a3f262c1658e 100644 --- a/setup.py +++ b/setup.py @@ -51,6 +51,8 @@ def _is_cuda() -> bool: "Cannot find ROCM_HOME. ROCm must be available to build the package." ) NVCC_FLAGS += ["-DUSE_ROCM"] + NVCC_FLAGS += [f"-U__HIP_NO_HALF_CONVERSIONS__"] + NVCC_FLAGS += [f"-U__HIP_NO_HALF_OPERATORS__"] if _is_cuda() and CUDA_HOME is None: raise RuntimeError( diff --git a/vllm/utils.py b/vllm/utils.py index 23b6ca320d300..6a9508f6d33b4 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -112,10 +112,10 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int: # the Neuron-X backend does not have the `cuda_utils` module. 
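    # The device attribute lookup now happens inside the C++ extension:
    # CUDA uses cudaDevAttrMaxSharedMemoryPerBlockOptin (enum value 97),
    # while ROCm uses hipDeviceAttributeMaxSharedMemoryPerBlock (the value 74
    # that used to be hardcoded here), selected under #ifdef USE_ROCM.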
from vllm._C import cuda_utils - # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html - cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 - max_shared_mem = cuda_utils.get_device_attribute( - cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu) + max_shared_mem = cuda_utils.get_max_shared_memory_per_block_device_attribute( + gpu) + # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py will fail + assert max_shared_mem > 0, "max_shared_mem can not be zero" return int(max_shared_mem) From 3a0e1fc070dc7482ab1c8fcdc961e5729a4cb0b3 Mon Sep 17 00:00:00 2001 From: dakotamahan-stability <139925645+dakotamahan-stability@users.noreply.github.com> Date: Fri, 26 Jan 2024 14:45:19 -0600 Subject: [PATCH 002/112] Support for Stable LM 2 (#2598) Co-authored-by: Zhuohan Li --- vllm/model_executor/models/stablelm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index cf842d087669f..95e5ad8ede63e 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -98,7 +98,7 @@ def __init__(self, self.scaling = self.head_dim**-0.5 self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_key_value_heads * self.head_dim - + self.qkv_bias = getattr(config, "use_qkv_bias", False) if (self.head_dim * self.num_heads * tp_size) != self.hidden_size: raise ValueError( f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" @@ -108,7 +108,7 @@ def __init__(self, self.head_dim, self.total_num_heads, self.total_num_key_value_heads, - bias=False, + self.qkv_bias, linear_method=linear_method) self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, self.hidden_size, From 390b495ff327e8548c3f7cd701afce87870d9102 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 26 Jan 2024 15:19:19 -0800 Subject: [PATCH 003/112] Don't build punica kernels by default (#2605) --- .github/workflows/scripts/build.sh | 2 ++ Dockerfile | 2 ++ setup.py | 2 +- vllm/lora/punica.py | 9 ++++++--- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index cf3bc11823b43..2578d448436d2 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -13,6 +13,8 @@ $python_executable -m pip install -r requirements.txt # Limit the number of parallel jobs to avoid OOM export MAX_JOBS=1 +# Make sure punica is built for the release (for LoRA) +export VLLM_INSTALL_PUNICA_KERNELS=1 # Build $python_executable setup.py bdist_wheel --dist-dir=dist diff --git a/Dockerfile b/Dockerfile index 44b1dd17d7e02..4cfcf058004c5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,6 +45,8 @@ ENV MAX_JOBS=${max_jobs} # number of threads used by nvcc ARG nvcc_threads=8 ENV NVCC_THREADS=$nvcc_threads +# make sure punica kernels are built (for LoRA) +ENV VLLM_INSTALL_PUNICA_KERNELS=1 RUN python3 setup.py build_ext --inplace #################### EXTENSION Build IMAGE #################### diff --git a/setup.py b/setup.py index 5a3f262c1658e..88fa495205659 100644 --- a/setup.py +++ b/setup.py @@ -265,7 +265,7 @@ def get_torch_arch_list() -> Set[str]: with contextlib.suppress(ValueError): torch_cpp_ext.COMMON_NVCC_FLAGS.remove(flag) - install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "1"))) + install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) device_count = torch.cuda.device_count() for i in 
range(device_count): major, minor = torch.cuda.get_device_capability(i) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index ac96931b2d071..bcb73ccc19b0e 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -157,10 +157,13 @@ def _raise_exc( **kwargs # pylint: disable=unused-argument ): if torch.cuda.get_device_capability() < (8, 0): - raise ImportError( - "LoRA kernels require compute capability>=8.0") from import_exc + raise ImportError("punica LoRA kernels require compute " + "capability>=8.0") from import_exc else: - raise import_exc + raise ImportError( + "punica LoRA kernels could not be imported. If you built vLLM " + "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " + "was set.") from import_exc bgmv = _raise_exc add_lora = _raise_exc From beb89f68b448a43ac112b48e3834f80a2df626cb Mon Sep 17 00:00:00 2001 From: Casper Date: Sat, 27 Jan 2024 08:53:17 +0100 Subject: [PATCH 004/112] AWQ: Up to 2.66x higher throughput (#2566) --- csrc/ops.h | 8 ++ csrc/pybind.cpp | 1 + csrc/quantization/awq/gemm_kernels.cu | 108 ++++++++++++++++++ .../model_executor/layers/quantization/awq.py | 11 +- 4 files changed, 127 insertions(+), 1 deletion(-) diff --git a/csrc/ops.h b/csrc/ops.h index 9340a60da1417..d49619644b182 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -70,6 +70,14 @@ torch::Tensor awq_gemm( torch::Tensor _scaling_factors, torch::Tensor _zeros, int split_k_iters); + +torch::Tensor awq_dequantize( + torch::Tensor _kernel, + torch::Tensor _scaling_factors, + torch::Tensor _zeros, + int split_k_iters, + int thx, + int thy); #endif void squeezellm_gemm( diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index e6683c446154d..88af7eac8a28f 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -51,6 +51,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { #ifndef USE_ROCM // Quantization ops ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); + ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ"); #endif ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ"); ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ"); diff --git a/csrc/quantization/awq/gemm_kernels.cu b/csrc/quantization/awq/gemm_kernels.cu index 04dfe8fe9b889..376c8ebfb9b7a 100644 --- a/csrc/quantization/awq/gemm_kernels.cu +++ b/csrc/quantization/awq/gemm_kernels.cu @@ -493,9 +493,117 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, in #endif } +__global__ void __launch_bounds__(64) dequantize_weights( + int* __restrict__ B, + half* __restrict__ scaling_factors, + int* __restrict__ zeros, + half* __restrict__ C, + int G +) +{ + int j_factors1 = 4; + int row_stride2 = 4; + int split_k_iters = 1; + static constexpr uint32_t ZERO = 0x0; + half B_shared[32 * (128 + 8)]; + + half* B_shared_ptr2 = B_shared; + + half B_shared_warp[32]; + int OC = 512; + + int N = blockDim.x * gridDim.x; // 2 + int col = (blockIdx.x * blockDim.x + threadIdx.x); + int row = blockIdx.y * blockDim.y + threadIdx.y; + int index1 = 8 * col + 8 * row * N; + half* C_ptr2 = C + index1; + + int index2 = col + row * N; + int* B_ptr2 = B + index2; + + int index3 = col + (int)(row / G) * N; + int* zeros_ptr2 = zeros + index3; + int index4 = 8 * col + (int)(row / G) * N * 8; + half* scaling_factors_ptr2 = scaling_factors + index4; + + + uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr2); + uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded); + uint4 B_loaded_scale = *(uint4*)(scaling_factors_ptr2); +int j=0; + + uint32_t B_loaded = *(uint32_t*)(B_ptr2 + j); + uint4 
B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); + + *(uint4*)(B_shared_ptr2 + j) = B_loaded_fp16; + + for (int i=0; i<8; ++i) { + *(C_ptr2 + i) = B_shared[i]; + } +} + } // namespace awq } // namespace vllm +torch::Tensor awq_dequantize( + torch::Tensor _kernel, + torch::Tensor _scaling_factors, + torch::Tensor _zeros, + int split_k_iters, + int thx, + int thy) +{ + int in_c = _kernel.size(0); + int qout_c = _kernel.size(1); + int out_c = qout_c * 8; + int G = in_c / _scaling_factors.size(0); + + int x_thread = thx; + int y_thread = thy; + + int x_blocks = 1; + int y_blocks = 1; + if (thx==0) { + x_thread = qout_c; + } + if (thy==0) { + y_thread = in_c; + } + if (thx==0 && thy==0) { + x_thread = 8; + y_thread = 8; + x_blocks = (int)(qout_c / 8); + y_blocks = (int)(in_c / 8); + } + + const at::cuda::OptionalCUDAGuard device_guard(device_of(_scaling_factors)); + + auto options = torch::TensorOptions().dtype(_scaling_factors.dtype()).device(_scaling_factors.device()); + at::Tensor _de_kernel = torch::empty({in_c, out_c}, options); + + auto kernel = reinterpret_cast(_kernel.data_ptr()); + auto de_kernel = reinterpret_cast(_de_kernel.data_ptr()); + auto scaling_factors = reinterpret_cast(_scaling_factors.data_ptr()); + auto zeros = reinterpret_cast(_zeros.data_ptr()); + + dim3 num_blocks(x_blocks, y_blocks); + dim3 threads_per_block(x_thread, y_thread); + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + vllm::awq::dequantize_weights<<>>( + kernel, scaling_factors, zeros, de_kernel, G); + + return _de_kernel; +} + // in_feats: M, IC [float16] // kernel: IC, OC // 8 [int32] -> cast to IC, OC [uint4b] // scaling_factors: IC // G, OC [float16] diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 831576b1d7cd7..4d3fd3ec0cc71 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -153,7 +153,16 @@ def apply_weights(self, pack_factor = self.quant_config.pack_factor out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, )) reshaped_x = x.reshape(-1, x.shape[-1]) - out = ops.awq_gemm(reshaped_x, qweight, scales, qzeros, pack_factor) + + # num_tokens >= threshold + FP16_MATMUL_HEURISTIC_CONDITION = x.shape[:-1].numel() >= 256 + + if FP16_MATMUL_HEURISTIC_CONDITION: + out = ops.awq_dequantize(qweight, scales, qzeros, 0, 0, 0) + out = torch.matmul(reshaped_x, out) + else: + out = ops.awq_gemm(reshaped_x, qweight, scales, qzeros, + pack_factor) if bias is not None: out = out + bias return 
out.reshape(out_shape) From 220a47627bf48c728ce0a2737be39c400bb6f653 Mon Sep 17 00:00:00 2001 From: Xiang Xu <117880274+xiangxu-google@users.noreply.github.com> Date: Sat, 27 Jan 2024 10:30:49 -0800 Subject: [PATCH 005/112] Use head_dim in config if exists (#2622) --- vllm/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index 8acd15a3b7d9a..4f1ce87cb615b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -213,6 +213,8 @@ def get_hidden_size(self) -> int: return self.hf_config.hidden_size def get_head_size(self) -> int: + if hasattr(self.hf_config, "head_dim"): + return self.hf_config.head_dim # FIXME(woosuk): This may not be true for all models. return self.hf_config.hidden_size // self.hf_config.num_attention_heads From 380170038e05cf81953c29d7e8ed789e048b6434 Mon Sep 17 00:00:00 2001 From: Hanzhi Zhou Date: Sun, 28 Jan 2024 04:46:35 +0800 Subject: [PATCH 006/112] Implement custom all reduce kernels (#2192) --- csrc/custom_all_reduce.cu | 148 +++++ csrc/custom_all_reduce.cuh | 555 ++++++++++++++++++ csrc/custom_all_reduce_test.cu | 284 +++++++++ csrc/ops.h | 22 + csrc/pybind.cpp | 16 + requirements.txt | 1 + setup.py | 6 +- tests/distributed/test_comm_ops.py | 30 +- tests/distributed/test_custom_all_reduce.py | 85 +++ vllm/config.py | 14 + vllm/engine/arg_utils.py | 8 +- vllm/engine/llm_engine.py | 1 + vllm/entrypoints/llm.py | 3 + .../parallel_utils/communication_op.py | 14 +- .../parallel_utils/custom_all_reduce.py | 223 +++++++ vllm/test_utils.py | 38 ++ vllm/worker/model_runner.py | 60 +- vllm/worker/worker.py | 10 +- 18 files changed, 1453 insertions(+), 65 deletions(-) create mode 100644 csrc/custom_all_reduce.cu create mode 100644 csrc/custom_all_reduce.cuh create mode 100644 csrc/custom_all_reduce_test.cu create mode 100644 tests/distributed/test_custom_all_reduce.py create mode 100644 vllm/model_executor/parallel_utils/custom_all_reduce.py create mode 100644 vllm/test_utils.py diff --git a/csrc/custom_all_reduce.cu b/csrc/custom_all_reduce.cu new file mode 100644 index 0000000000000..88e4af9d4a99f --- /dev/null +++ b/csrc/custom_all_reduce.cu @@ -0,0 +1,148 @@ +#include +#include +#include +#include + +#include "custom_all_reduce.cuh" + +// fake pointer type +using fptr_t = uint64_t; +static_assert(sizeof(void *) == sizeof(fptr_t)); + +fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data, + const std::vector &handles, + const std::vector &offsets, int rank, + bool full_nvlink) { + int world_size = offsets.size(); + if (world_size > 8) + throw std::invalid_argument("world size > 8 is not supported"); + if (world_size % 2 != 0) + throw std::invalid_argument("Odd num gpus is not supported for now"); + if (world_size != handles.size()) + throw std::invalid_argument( + "handles length should equal to offsets length"); + if (rank < 0 || rank >= world_size) + throw std::invalid_argument("invalid rank passed in"); + + cudaIpcMemHandle_t ipc_handles[8]; + for (int i = 0; i < world_size; i++) { + std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t)); + } + return (fptr_t) new vllm::CustomAllreduce( + reinterpret_cast(meta.data_ptr()), rank_data.data_ptr(), + rank_data.numel(), ipc_handles, offsets, rank, full_nvlink); +} + +/** + * Make sure tensor t's data lies completely within ((char)t.data_ptr()) + + * t.numel() * t.element_size(). This is slightly weaker than t.is_contiguous() + * because it allows transpose of contiguous slice (i.e. slicing the first + * dimension). 
Currently, we require this because stride information is not + * passed into the kernels and we treat input tensors as flat. + * + * Examples + * A = torch.zeros(3, 3, 3) + * 1. A: OK + * 2. A[1:]: OK + * 3. A.permute(2, 0, 1): OK + * 4. A[1:].permute(2, 0, 1): OK + * 5. A[None].expand(2, -1, -1, -1): Not OK + * 6. A[:, 1:, 1:]: Not OK + */ +bool _is_weak_contiguous(torch::Tensor &t) { + return t.is_contiguous() || + (t.storage().nbytes() - t.storage_offset() * t.element_size() == + t.numel() * t.element_size()); +} + +bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size, + bool full_nvlink) { + auto inp_size = inp.numel() * inp.element_size(); + // custom allreduce requires input byte size to be multiples of 16 + if (inp_size % 16 != 0) return false; + if (!_is_weak_contiguous(inp)) return false; + if (world_size == 2 || full_nvlink) return inp_size <= max_size; + // 4 PCIE GPUs use 2 stage allreduce, and is only faster than NCCL when size + // <= 512k + return world_size <= 4 && inp_size <= 512 * 1024; +} + +void _all_reduce(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out, + cudaStream_t stream) { + auto fa = reinterpret_cast(_fa); + TORCH_CHECK(_is_weak_contiguous(out)); + switch (out.scalar_type()) { + case at::ScalarType::Float: { + fa->allreduce(stream, reinterpret_cast(inp.data_ptr()), + reinterpret_cast(out.data_ptr()), + out.numel()); + break; + } + case at::ScalarType::Half: { + fa->allreduce(stream, reinterpret_cast(inp.data_ptr()), + reinterpret_cast(out.data_ptr()), + out.numel()); + break; + } +#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) + case at::ScalarType::BFloat16: { + fa->allreduce( + stream, reinterpret_cast(inp.data_ptr()), + reinterpret_cast(out.data_ptr()), out.numel()); + break; + } +#endif + default: + throw std::runtime_error( + "custom allreduce only supports float32, float16 and bfloat16"); + } +} + +void all_reduce_reg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); + auto stream = c10::cuda::getCurrentCUDAStream().stream(); + TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); + TORCH_CHECK_EQ(inp.numel(), out.numel()); + _all_reduce(_fa, inp, out, stream); +} + +void all_reduce_unreg(fptr_t _fa, torch::Tensor &inp, torch::Tensor ®_buffer, + torch::Tensor &out) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); + auto stream = c10::cuda::getCurrentCUDAStream().stream(); + + auto input_size = inp.numel() * inp.element_size(); + TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); + TORCH_CHECK_EQ(inp.numel(), out.numel()); + TORCH_CHECK(input_size <= reg_buffer.numel() * reg_buffer.element_size(), + "registered buffer is too small to contain the input"); + AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer.data_ptr(), inp.data_ptr(), + input_size, cudaMemcpyDeviceToDevice, stream)); + _all_reduce(_fa, reg_buffer, out, stream); +} + +void dispose(fptr_t _fa) { + auto fa = reinterpret_cast(_fa); + delete fa; +} + +int meta_size() { return sizeof(vllm::Metadata); } + +void register_buffer(fptr_t _fa, torch::Tensor &t, + const std::vector &handles, + const std::vector &offsets) { + auto fa = reinterpret_cast(_fa); + fa->register_buffer(handles, offsets, t.data_ptr()); +} + +std::pair, std::vector> get_graph_buffer_ipc_meta( + fptr_t _fa) { + auto fa = reinterpret_cast(_fa); + return fa->get_graph_buffer_ipc_meta(); +} + +void register_graph_buffers(fptr_t _fa, const std::vector &handles, + const std::vector> &offsets) { + auto fa = 
reinterpret_cast(_fa); + fa->register_graph_buffers(handles, offsets); +} diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh new file mode 100644 index 0000000000000..6e71bb9a9c6e8 --- /dev/null +++ b/csrc/custom_all_reduce.cuh @@ -0,0 +1,555 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +namespace vllm { + +struct Signal { + alignas(64) union { + uint64_t flag; + unsigned char data[8]; + } start; + alignas(64) union { + uint64_t flag; + unsigned char data[8]; + } end; +}; + +struct Metadata { + alignas(128) Signal sg; + alignas(128) int counter; +}; +static_assert(offsetof(Metadata, counter) == 128); +static_assert(sizeof(Metadata) == 256); + +struct __align__(16) RankData { const void *__restrict__ ptrs[8]; }; + +struct RankSignals { + volatile Signal *signals[8]; +}; + +// like std::array, but aligned +template +struct __align__(alignof(T) * sz) array_t { + T data[sz]; + using type = T; + static constexpr int size = sz; +}; + +// use packed type to maximize memory efficiency +// goal: generate ld.128 and st.128 instructions +template +struct packed_t { + // the (P)acked type for load/store + using P = array_t; + // the (A)ccumulator type for reduction + using A = array_t; +}; + +#define DINLINE __device__ __forceinline__ + +// scalar cast functions +DINLINE float upcast_s(half val) { return __half2float(val); } + +template +DINLINE T downcast_s(float val); +template <> +DINLINE half downcast_s(float val) { + return __float2half(val); +} + +// scalar add functions +// for some reason when compiling with Pytorch, the + operator for half and +// bfloat is disabled so we call the intrinsics directly +DINLINE half &assign_add(half &a, half b) { + a = __hadd(a, b); + return a; +} +DINLINE float &assign_add(float &a, float b) { return a += b; } + +#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +DINLINE float upcast_s(nv_bfloat16 val) { return __bfloat162float(val); } +template <> +DINLINE nv_bfloat16 downcast_s(float val) { + return __float2bfloat16(val); +} +DINLINE nv_bfloat16 &assign_add(nv_bfloat16 &a, nv_bfloat16 b) { + a = __hadd(a, b); + return a; +} +#endif + +template +DINLINE array_t &packed_assign_add(array_t &a, array_t b) { +#pragma unroll + for (int i = 0; i < N; i++) { + assign_add(a.data[i], b.data[i]); + } + return a; +} + +template +DINLINE array_t upcast(array_t val) { + if constexpr (std::is_same::value) { + return val; + } else { + array_t out; +#pragma unroll + for (int i = 0; i < N; i++) { + out.data[i] = upcast_s(val.data[i]); + } + return out; + } +} + +template +DINLINE O downcast(array_t val) { + if constexpr (std::is_same::value) { + return val; + } else { + O out; +#pragma unroll + for (int i = 0; i < O::size; i++) { + out.data[i] = downcast_s(val.data[i]); + } + return out; + } +} + +// compute flag at compile time +__host__ __device__ constexpr uint64_t compute_flag(int ngpus) { + auto m = std::numeric_limits::max(); + return m >> ((8 - ngpus) * 8); +} + +template +DINLINE void start_sync(const RankSignals &sg, volatile Metadata *meta, + int rank) { + constexpr auto FLAG = compute_flag(ngpus); + if (blockIdx.x == 0) { + if (threadIdx.x < ngpus) + // simultaneously write to the corresponding byte to all other ranks. 
+ // Latency = 1 p2p write + sg.signals[threadIdx.x]->start.data[rank] = 255; + else if (threadIdx.x == 32) + // reset + meta->sg.end.flag = 0; + } + if (threadIdx.x == 0) { + while (meta->sg.start.flag != FLAG) + ; + } + __syncthreads(); +} + +template +DINLINE void end_sync(const RankSignals &sg, volatile Metadata *meta, + int rank) { + constexpr auto FLAG = compute_flag(ngpus); + __syncthreads(); + __shared__ int num; + if (threadIdx.x == 0) num = atomicAdd((int *)&meta->counter, 1); + __syncthreads(); + + // Only the last completing block can perform the end synchronization + // This can ensures when the final busy wait ends, all ranks must have + // finished reading each other's buffer. + if (num == gridDim.x - 1) { + if (threadIdx.x == 32) { + // reset in a different warp + meta->counter = 0; + meta->sg.start.flag = 0; + } else if (threadIdx.x < ngpus) { + // simultaneously write to the corresponding byte to all other ranks. + // Latency = 1 p2p write + sg.signals[threadIdx.x]->end.data[rank] = 255; + } + // if this is the final sync, only one block needs it + // because kernel exit can serve as sync + if constexpr (final_sync) { + if (threadIdx.x == 0) { + while (meta->sg.end.flag != FLAG) + ; + } + } + } + if constexpr (!final_sync) { + if (threadIdx.x == 0) { + while (meta->sg.end.flag != FLAG) + ; + } + __syncthreads(); + } +} + +template +DINLINE P packed_reduce(const P *ptrs[], int idx) { + A tmp = upcast(ptrs[0][idx]); +#pragma unroll + for (int i = 1; i < ngpus; i++) { + packed_assign_add(tmp, upcast(ptrs[i][idx])); + } + return downcast
<P>
(tmp); +} + +template +__global__ void __launch_bounds__(512, 1) + cross_device_reduce_1stage(RankData *_dp, RankSignals sg, + volatile Metadata *meta, T *__restrict__ result, + int rank, int size) { + using P = typename packed_t::P; + using A = typename packed_t::A; + // note: we don't reorder the address so the accumulation order is the same + // for all ranks, ensuring bitwise identical results + auto dp = *_dp; + start_sync(sg, meta, rank); + // do the actual reduction + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + ((P *)result)[idx] = + packed_reduce((const P **)&dp.ptrs[0], idx); + } + end_sync(sg, meta, rank); +} + +template +DINLINE P *get_tmp_buf(volatile Signal *sg) { + return (P *)(((Metadata *)sg) + 1); +} + +template +__global__ void __launch_bounds__(512, 1) + cross_device_reduce_2stage(RankData *_dp, RankSignals sg, + volatile Metadata *meta, T *__restrict__ result, + int rank, int size) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + using P = typename packed_t::P; + using A = typename packed_t::A; + int part = size / ngpus; + int start = rank * part; + int end = rank == ngpus - 1 ? size : start + part; + const P *ptrs[ngpus]; + P *tmps[ngpus]; +#pragma unroll + for (int i = 0; i < ngpus; i++) { + int target = (rank + i) % ngpus; + ptrs[i] = (const P *)_dp->ptrs[target]; + tmps[i] = get_tmp_buf
<P>
(sg.signals[target]); + } + auto tmp_out = tmps[0]; + start_sync(sg, meta, rank); + // stage 1: reduce scatter + for (int idx = start + tid; idx < end; idx += stride) { + tmp_out[idx - start] = packed_reduce(ptrs, idx); + } + // Maybe TODO: replace this with per-block release-acquire + // can save about 1-2us (not a lot though) + end_sync(sg, meta, rank); + + // stage 2: allgather + for (int idx = tid; idx < part; idx += stride) { +#pragma unroll + for (int i = 0; i < ngpus; i++) { + int dst_idx = ((rank + i) % ngpus) * part + idx; + ((P *)result)[dst_idx] = tmps[i][idx]; + } + } + // process the last larger partition + int remaining = size - part * ngpus; + if (tid < remaining) { + int dst_idx = tid + part * ngpus; + ((P *)result)[dst_idx] = get_tmp_buf
<P>
(sg.signals[ngpus - 1])[part + tid]; + } + + // faster than this + // for (int idx = tid; idx < size; idx += stride) { + // int target_rank = idx / part; + // if (target_rank == ngpus) target_rank -= 1; + // ((P *)result)[idx] = tmps[target_rank][idx - target_rank * part]; + // } +} + +template +__global__ void __launch_bounds__(512, 1) + cross_device_reduce_half_butterfly(RankData *_dp, RankSignals sg, + volatile Metadata *meta, + T *__restrict__ result, int rank, + int size) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + using P = typename packed_t::P; + using A = typename packed_t::A; + auto tmp_out = get_tmp_buf
<P>
(sg.signals[rank]); + constexpr int hg = ngpus / 2; + // Actually not quite half butterfly. + // This is an all-to-all within each group containing half of the ranks + // followed by cross-group add. Equivalent to half butterfly when there + // are 4 GPUs, a common case for PCIe cards like T4 and A10. + const P *ptrs[hg]; + { + int start = rank - rank % hg; +#pragma unroll + for (int i = 0; i < hg; i++) { + ptrs[i] = (const P *)_dp->ptrs[i + start]; + } + } + start_sync(sg, meta, rank); + for (int idx = tid; idx < size; idx += stride) { + tmp_out[idx] = packed_reduce(ptrs, idx); + } + end_sync(sg, meta, rank); + + auto src = get_tmp_buf
<P>
(sg.signals[(ngpus - 1) - rank % ngpus]); + // do the cross group reduction + for (int idx = tid; idx < size; idx += stride) { + auto tmp = tmp_out[idx]; + packed_assign_add(tmp, src[idx]); + ((P *)result)[idx] = tmp; + } +} + +class CustomAllreduce { + public: + int rank_; + int world_size_; + bool full_nvlink_; + + // below are device pointers + RankSignals sg_; + std::unordered_map buffers_; + Metadata *meta_; + + // stores the registered device pointers from all ranks + RankData *d_rank_data_base_, *d_rank_data_end_; + std::vector graph_unreg_buffers_; + std::vector ipc_handles_; + + /** + * meta is a pointer to device metadata and temporary buffer for allreduce. + * + * There's a total of sizeof(Metadata) of prefix before the actual data, + * so meta + 1 points to actual temporary buffer. + * + * note: this class does not own any device memory. Any required buffers + * are passed in from the constructor + */ + CustomAllreduce(Metadata *meta, void *rank_data, size_t rank_data_sz, + const cudaIpcMemHandle_t *handles, + const std::vector &offsets, int rank, + bool full_nvlink = true) + : rank_(rank), + world_size_(offsets.size()), + full_nvlink_(full_nvlink), + meta_(meta), + d_rank_data_base_(reinterpret_cast(rank_data)), + d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) { + for (int i = 0; i < world_size_; i++) { + Metadata *rank_meta; + if (i != rank_) { + char *handle; + CUDACHECK(cudaIpcOpenMemHandle((void **)&handle, handles[i], + cudaIpcMemLazyEnablePeerAccess)); + ipc_handles_.push_back(handle); + handle += offsets[i]; + rank_meta = (Metadata *)handle; + } else { + rank_meta = meta_; + } + sg_.signals[i] = &rank_meta->sg; + } + } + + std::pair, std::vector> + get_graph_buffer_ipc_meta() { + auto num_buffers = graph_unreg_buffers_.size(); + auto handle_sz = sizeof(cudaIpcMemHandle_t); + std::vector handles(handle_sz * num_buffers, 0); + std::vector offsets(num_buffers); + for (int i = 0; i < num_buffers; i++) { + auto ptr = graph_unreg_buffers_[i]; + void *base_ptr; + // note: must share the base address of each allocation, or we get wrong + // address + if (cuPointerGetAttribute(&base_ptr, + CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + (CUdeviceptr)ptr) != CUDA_SUCCESS) + throw std::runtime_error("failed to get pointer attr"); + CUDACHECK(cudaIpcGetMemHandle( + (cudaIpcMemHandle_t *)&handles[i * handle_sz], base_ptr)); + offsets[i] = ((char *)ptr) - ((char *)base_ptr); + } + return std::make_pair(handles, offsets); + } + + void check_rank_data_capacity(size_t num = 1) { + if (d_rank_data_base_ + num > d_rank_data_end_) + throw std::runtime_error( + "Rank data buffer is overflowed by " + + std::to_string(d_rank_data_base_ + num - d_rank_data_end_)); + } + + void register_buffer(const std::vector &handles, + const std::vector &offsets, void *self) { + check_rank_data_capacity(); + RankData data; + for (int i = 0; i < world_size_; i++) { + if (i != rank_) { + char *handle; + CUDACHECK(cudaIpcOpenMemHandle( + (void **)&handle, *((const cudaIpcMemHandle_t *)handles[i].data()), + cudaIpcMemLazyEnablePeerAccess)); + ipc_handles_.push_back(handle); + handle += offsets[i]; + data.ptrs[i] = handle; + } else { + data.ptrs[i] = self; + } + } + auto d_data = d_rank_data_base_++; + CUDACHECK( + cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice)); + buffers_[self] = d_data; + } + + // note: when registering graph buffers, we intentionally choose to not + // deduplicate the addresses. 
That means if the allocator reuses some + // addresses, they will be registered again. This is to account for the remote + // possibility of different allocation patterns between ranks. For example, + // rank 1 may get the same input address for the second allreduce, but rank 2 + // got a different address. IPC handles have internal reference counting + // mechanism so overhead should be small. + void register_graph_buffers( + const std::vector &handles, + const std::vector> &offsets) { + auto num_buffers = graph_unreg_buffers_.size(); + check_rank_data_capacity(num_buffers); + std::vector rank_data(num_buffers); + for (int i = 0; i < num_buffers; i++) { + auto self_ptr = graph_unreg_buffers_[i]; + auto &rd = rank_data[i]; + for (int j = 0; j < world_size_; j++) { + if (j != rank_) { + char *handle; + CUDACHECK(cudaIpcOpenMemHandle( + (void **)&handle, + *((cudaIpcMemHandle_t *)&handles[j] + [i * sizeof(cudaIpcMemHandle_t)]), + cudaIpcMemLazyEnablePeerAccess)); + ipc_handles_.push_back(handle); + handle += offsets[j][i]; + rd.ptrs[j] = handle; + } else { + rd.ptrs[j] = self_ptr; + } + } + } + CUDACHECK(cudaMemcpy(d_rank_data_base_, rank_data.data(), + sizeof(RankData) * num_buffers, + cudaMemcpyHostToDevice)); + d_rank_data_base_ += num_buffers; + graph_unreg_buffers_.clear(); + } + + /** + * This is the result after careful grid search. Using 36 blocks give the best + * or close to the best runtime on the devices I tried: A100, A10, A30, T4, + * V100. You'll notice that NCCL kernels also only take a small amount of SMs. + * Not quite sure the underlying reason, but my guess is that too many SMs + * will cause contention on NVLink bus. + */ + template + void allreduce(cudaStream_t stream, T *input, T *output, int size, + int threads = 512, int block_limit = 36) { + auto d = packed_t::P::size; + if (size % d != 0) + throw std::runtime_error( + "custom allreduce currently requires input length to be multiple " + "of " + + std::to_string(d)); + + RankData *ptrs; + cudaStreamCaptureStatus status; + CUDACHECK(cudaStreamIsCapturing(stream, &status)); + if (status == cudaStreamCaptureStatusActive) { + ptrs = d_rank_data_base_ + graph_unreg_buffers_.size(); + graph_unreg_buffers_.push_back(input); + } else { + auto it = buffers_.find(input); + if (it == buffers_.end()) + throw std::runtime_error( + "buffer address " + + std::to_string(reinterpret_cast(input)) + + " is not registered!"); + ptrs = it->second; + } + + size /= d; + auto bytes = size * sizeof(typename packed_t::P); + int blocks = std::min(block_limit, (size + threads - 1) / threads); +#define KL(ngpus, name) \ + name \ + <<>>(ptrs, sg_, meta_, output, rank_, size); +#define REDUCE_CASE(ngpus) \ + case ngpus: { \ + if (world_size_ == 2) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else if (full_nvlink_) { \ + if ((world_size_ <= 4 && bytes < 512 * 1024) || \ + (world_size_ <= 8 && bytes < 256 * 1024)) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else { \ + KL(ngpus, cross_device_reduce_2stage); \ + } \ + } else { \ + KL(ngpus, cross_device_reduce_half_butterfly); \ + } \ + break; \ + } + + switch (world_size_) { + REDUCE_CASE(2) + REDUCE_CASE(4) + REDUCE_CASE(6) + REDUCE_CASE(8) + default: + throw std::runtime_error( + "custom allreduce only supports num gpus in (2,4,6,8). 
Actual num " + "gpus = " + + std::to_string(world_size_)); + } +#undef REDUCE_CASE +#undef KL + } + + ~CustomAllreduce() { + for (auto ptr : ipc_handles_) { + CUDACHECK(cudaIpcCloseMemHandle(ptr)); + } + } +}; +/** + * To inspect PTX/SASS, copy paste this header file to compiler explorer and add + a template instantiation: + * template void CustomAllreduce::allreduce(cudaStream_t, half *, half *, + int, int, int); +*/ +} // namespace vllm diff --git a/csrc/custom_all_reduce_test.cu b/csrc/custom_all_reduce_test.cu new file mode 100644 index 0000000000000..6b094e2fdc9ba --- /dev/null +++ b/csrc/custom_all_reduce_test.cu @@ -0,0 +1,284 @@ +/** + * This is a standalone test for custom allreduce. + * To compile, make sure you have MPI and NCCL installed in your system. + * export MPI_HOME=XXX + * nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o + * custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi + * + * Warning: this C++ test is not designed to be very readable and was used + * during the rapid prototyping process. + * + * To run: + * mpirun -np 8 ./custom_all_reduce_test + */ +#include +#include +#include +#include + +#include +#include + +#include "cuda_profiler_api.h" +#include "custom_all_reduce.cuh" +#include "mpi.h" +#include "nccl.h" + +#define MPICHECK(cmd) \ + do { \ + int e = cmd; \ + if (e != MPI_SUCCESS) { \ + printf("Failed: MPI error %s:%d '%d'\n", __FILE__, __LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t r = cmd; \ + if (r != ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +__global__ void dummy_kernel() { + for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms +} + +template +__global__ void set_data(T *data, int size, int myRank) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + data[idx] = myRank * 0.11f; + } +} + +template +__global__ void convert_data(const T *data1, const T *data2, double *fdata1, + double *fdata2, int size) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + fdata1[idx] = data1[idx]; + fdata2[idx] = data2[idx]; + } +} + +__global__ void init_rand(curandState_t *state, int size, int nRanks) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + for (int i = 0; i < nRanks; i++) { + curand_init(i + 1, idx, 0, &state[idx * nRanks + i]); + } + } +} + +template +__global__ void gen_data(curandState_t *state, T *data, double *ground_truth, + int myRank, int nRanks, int size) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + double sum = 0.0; + for (int i = 0; i < nRanks; i++) { + double val = curand_uniform_double(&state[idx * nRanks + i]) * 4; + T hval = val; // downcast first + sum += static_cast(hval); + if (i == myRank) data[idx] = hval; + } + ground_truth[idx] = sum; + } +} + +template +void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit, + int data_size) { + T *result; + cudaStream_t stream; + CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CUDACHECK(cudaMalloc(&result, data_size * sizeof(T))); + CUDACHECK(cudaMemset(result, 0, data_size * sizeof(T))); + + cudaIpcMemHandle_t self_data_handle; + cudaIpcMemHandle_t data_handles[8]; + vllm::Metadata *buffer; + T *self_data_copy; + /** + * 
Allocate IPC buffer + * + * The first section is a temporary buffer for storing intermediate allreduce + * results, if a particular algorithm requires it. The second section is for + * the input to the allreduce. The actual API takes the input pointer as an + * argument (that is, they can and usually should be allocated separately). + * But since the input pointers and the temporary buffer all require IPC + * registration, they are allocated and registered together in the test for + * convenience. + */ + CUDACHECK( + cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Metadata))); + CUDACHECK(cudaMemset(buffer, 0, + 2 * data_size * sizeof(T) + sizeof(vllm::Metadata))); + CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T))); + CUDACHECK(cudaIpcGetMemHandle(&self_data_handle, buffer)); + + MPICHECK(MPI_Allgather(&self_data_handle, sizeof(cudaIpcMemHandle_t), + MPI_BYTE, data_handles, sizeof(cudaIpcMemHandle_t), + MPI_BYTE, MPI_COMM_WORLD)); + + void *rank_data; + size_t rank_data_sz = 16 * 1024 * 1024; + CUDACHECK(cudaMalloc(&rank_data, rank_data_sz)); + std::vector offsets(nRanks, 0); + vllm::CustomAllreduce fa(buffer, rank_data, rank_data_sz, data_handles, + offsets, myRank); + auto *self_data = + reinterpret_cast(reinterpret_cast(buffer) + + sizeof(vllm::Metadata) + data_size * sizeof(T)); + // hack buffer registration + { + std::vector handles; + handles.reserve(nRanks); + for (int i = 0; i < nRanks; i++) { + char *begin = (char *)&data_handles[i]; + char *end = (char *)&data_handles[i + 1]; + handles.emplace_back(begin, end); + } + std::vector offsets( + nRanks, sizeof(vllm::Metadata) + data_size * sizeof(T)); + fa.register_buffer(handles, offsets, self_data); + } + + double *ground_truth; + CUDACHECK(cudaMallocHost(&ground_truth, data_size * sizeof(double))); + curandState_t *states; + CUDACHECK(cudaMalloc(&states, sizeof(curandState_t) * nRanks * data_size)); + init_rand<<<108, 1024, 0, stream>>>(states, data_size, nRanks); + gen_data<<<108, 1024, 0, stream>>>(states, self_data, ground_truth, myRank, + nRanks, data_size); + CUDACHECK(cudaMemcpyAsync(self_data_copy, self_data, data_size * sizeof(T), + cudaMemcpyDeviceToDevice, stream)); + cudaEvent_t start, stop; + CUDACHECK(cudaEventCreate(&start)); + CUDACHECK(cudaEventCreate(&stop)); + + ncclDataType_t ncclDtype; + if (std::is_same::value) { + ncclDtype = ncclFloat16; + } else if (std::is_same::value) { + ncclDtype = ncclBfloat16; + } else { + ncclDtype = ncclFloat; + } + + dummy_kernel<<<1, 1, 0, stream>>>(); + constexpr int warmup_iters = 5; + constexpr int num_iters = 25; + // warmup + for (int i = 0; i < warmup_iters; i++) { + NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum, comm, + stream)); + } + CUDACHECK(cudaEventRecord(start, stream)); + for (int i = 0; i < num_iters; i++) { + NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum, comm, + stream)); + } + CUDACHECK(cudaEventRecord(stop, stream)); + CUDACHECK(cudaStreamSynchronize(stream)); + float allreduce_ms = 0; + cudaEventElapsedTime(&allreduce_ms, start, stop); + + // if (myRank == 1) dummy_kernel<<<1, 1, 0, stream>>>(); + // set_data<<<16, 1024, 0, stream>>>(self_data, data_size, myRank); + + dummy_kernel<<<1, 1, 0, stream>>>(); + // warm up + for (int i = 0; i < warmup_iters; i++) { + fa.allreduce(stream, self_data, result, data_size, threads, block_limit); + } + CUDACHECK(cudaEventRecord(start, stream)); + for (int i = 0; i < num_iters; i++) { + fa.allreduce(stream, self_data, result, data_size, threads, 
block_limit); + } + CUDACHECK(cudaEventRecord(stop, stream)); + CUDACHECK(cudaStreamSynchronize(stream)); + + float duration_ms = 0; + cudaEventElapsedTime(&duration_ms, start, stop); + if (myRank == 0) + printf( + "Rank %d done, nGPUs:%d, sz (kb): %d, %d, %d, my time:%.2fus, nccl " + "time:%.2fus\n", + myRank, nRanks, data_size * sizeof(T) / 1024, threads, block_limit, + duration_ms * 1e3 / num_iters, allreduce_ms * 1e3 / num_iters); + + // And wait for all the queued up work to complete + CUDACHECK(cudaStreamSynchronize(stream)); + + NCCLCHECK(ncclAllReduce(self_data_copy, self_data, data_size, ncclDtype, + ncclSum, comm, stream)); + + double *nccl_result, *my_result; + CUDACHECK(cudaMallocHost(&nccl_result, data_size * sizeof(double))); + CUDACHECK(cudaMallocHost(&my_result, data_size * sizeof(double))); + + convert_data<<<108, 1024, 0, stream>>>(self_data, result, nccl_result, + my_result, data_size); + CUDACHECK(cudaStreamSynchronize(stream)); + + for (unsigned long j = 0; j < data_size; j++) { + auto diff = abs(nccl_result[j] - my_result[j]); + if (diff >= 1e-2) { + printf("Rank %d: Verification mismatch at %lld: %f != (my) %f, gt=%f\n", + myRank, j, nccl_result[j], my_result[j], ground_truth[j]); + break; + } + } + + long double nccl_diffs = 0.0; + long double my_diffs = 0.0; + for (int j = 0; j < data_size; j++) { + nccl_diffs += abs(nccl_result[j] - ground_truth[j]); + my_diffs += abs(my_result[j] - ground_truth[j]); + } + if (myRank == 0) + std::cout << "average abs diffs: nccl: " << nccl_diffs / data_size + << " me: " << my_diffs / data_size << std::endl; + + CUDACHECK(cudaFree(result)); + CUDACHECK(cudaFree(self_data_copy)); + CUDACHECK(cudaFree(rank_data)); + CUDACHECK(cudaFree(buffer)); + CUDACHECK(cudaFree(states)); + CUDACHECK(cudaFreeHost(ground_truth)); + CUDACHECK(cudaFreeHost(nccl_result)); + CUDACHECK(cudaFreeHost(my_result)); + CUDACHECK(cudaStreamDestroy(stream)); +} + +int main(int argc, char **argv) { + int nRanks, myRank; + MPICHECK(MPI_Init(&argc, &argv)); + MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank)); + MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks)); + CUDACHECK(cudaSetDevice(myRank)); + ncclUniqueId id; + ncclComm_t comm; + if (myRank == 0) ncclGetUniqueId(&id); + MPICHECK(MPI_Bcast(static_cast(&id), sizeof(id), MPI_BYTE, 0, + MPI_COMM_WORLD)); + NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank)); + + cudaProfilerStart(); + // for (int threads : {256, 512}) { + // for (int block_limit = 16; block_limit < 112; block_limit += 4) { + // run(myRank, nRanks, comm, threads, block_limit, 4096 * 1024); + // } + // } + for (int sz = 512; sz <= (32 << 20); sz *= 2) { + run(myRank, nRanks, comm, 512, 36, sz + 8 * 50); + } + + cudaProfilerStop(); + return EXIT_SUCCESS; +} diff --git a/csrc/ops.h b/csrc/ops.h index d49619644b182..6e996fd0d577b 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -97,3 +97,25 @@ torch::Tensor gptq_gemm( void gptq_shuffle( torch::Tensor q_weight, torch::Tensor q_perm); + + +#ifndef USE_ROCM +using fptr_t = uint64_t; +fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data, + const std::vector &handles, + const std::vector &offsets, int rank, + bool full_nvlink); +bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size, + bool full_nvlink); +void all_reduce_reg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out); +void all_reduce_unreg(fptr_t _fa, torch::Tensor &inp, torch::Tensor ®_buffer, + torch::Tensor &out); +void dispose(fptr_t _fa); +int meta_size(); +void register_buffer(fptr_t _fa, torch::Tensor &t, + 
const std::vector &handles, + const std::vector &offsets); +std::pair, std::vector> get_graph_buffer_ipc_meta(fptr_t _fa); +void register_graph_buffers(fptr_t _fa, const std::vector &handles, + const std::vector> &offsets); +#endif diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 88af7eac8a28f..f94efadfa101a 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -88,4 +88,20 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { &get_max_shared_memory_per_block_device_attribute, "Gets the maximum shared memory per block device attribute."); +#ifndef USE_ROCM + // Custom all-reduce kernels + pybind11::module custom_ar = m.def_submodule("custom_ar", "custom allreduce"); + custom_ar.def("init_custom_ar", &init_custom_ar, "init_custom_ar"); + custom_ar.def("should_custom_ar", &should_custom_ar, "should_custom_ar"); + custom_ar.def("all_reduce_reg", &all_reduce_reg, "all_reduce_reg"); + custom_ar.def("all_reduce_unreg", &all_reduce_unreg, "all_reduce_unreg"); + custom_ar.def("dispose", &dispose, "dispose"); + custom_ar.def("meta_size", &meta_size, "meta_size"); + custom_ar.def("register_buffer", ®ister_buffer, "register_buffer"); + custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta, + "get_graph_buffer_ipc_meta"); + custom_ar.def("register_graph_buffers", ®ister_graph_buffers, + "register_graph_buffers"); +#endif + } diff --git a/requirements.txt b/requirements.txt index 299bad38fbf8a..19871bdcc9548 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. aioprometheus[starlette] +pynvml == 11.5.0 diff --git a/setup.py b/setup.py index 88fa495205659..2f6242690a263 100644 --- a/setup.py +++ b/setup.py @@ -51,8 +51,8 @@ def _is_cuda() -> bool: "Cannot find ROCM_HOME. ROCm must be available to build the package." 
) NVCC_FLAGS += ["-DUSE_ROCM"] - NVCC_FLAGS += [f"-U__HIP_NO_HALF_CONVERSIONS__"] - NVCC_FLAGS += [f"-U__HIP_NO_HALF_OPERATORS__"] + NVCC_FLAGS += ["-U__HIP_NO_HALF_CONVERSIONS__"] + NVCC_FLAGS += ["-U__HIP_NO_HALF_OPERATORS__"] if _is_cuda() and CUDA_HOME is None: raise RuntimeError( @@ -307,6 +307,7 @@ def get_torch_arch_list() -> Set[str]: if _is_cuda(): vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") + vllm_extension_sources.append("csrc/custom_all_reduce.cu") if not _is_neuron(): vllm_extension = CUDAExtension( @@ -316,6 +317,7 @@ def get_torch_arch_list() -> Set[str]: "cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS, }, + libraries=["cuda"] if _is_cuda() else [], ) ext_modules.append(vllm_extension) diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index b12e563fd9d44..9474cb21599d4 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -6,25 +6,13 @@ import torch import ray -from vllm.config import ParallelConfig -from vllm.utils import get_open_port from vllm.model_executor.parallel_utils.communication_op import ( tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather, broadcast_tensor_dict, ) -from vllm.worker.worker import _init_distributed_environment - - -def init_test_distributed_environment(pipeline_parallel_size: int, - tensor_parallel_size: int, rank: int, - distributed_init_port: str): - parallel_config = ParallelConfig(pipeline_parallel_size, - tensor_parallel_size, - worker_use_ray=True) - distributed_init_method = f"tcp://localhost:{distributed_init_port}" - _init_distributed_environment(parallel_config, rank, - distributed_init_method) +from vllm.test_utils import (init_test_distributed_environment, + multi_process_tensor_parallel) @ray.remote(num_gpus=1, max_calls=1) @@ -101,16 +89,4 @@ def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int, broadcast_tensor_dict_test_worker ]) def test_multi_process_tensor_parallel(tensor_parallel_size, test_target): - # Using ray helps debugging the error when it failed - # as compared to multiprocessing. 
- ray.init() - - distributed_init_port = get_open_port() - refs = [] - for rank in range(tensor_parallel_size): - refs.append( - test_target.remote(tensor_parallel_size, rank, - distributed_init_port)) - ray.get(refs) - - ray.shutdown() + multi_process_tensor_parallel(tensor_parallel_size, test_target) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py new file mode 100644 index 0000000000000..ed4965593c2f0 --- /dev/null +++ b/tests/distributed/test_custom_all_reduce.py @@ -0,0 +1,85 @@ +import random + +import os +import pytest +import ray +import torch +import torch.distributed as dist + +from vllm.model_executor.parallel_utils import custom_all_reduce as custom_ar +from vllm.model_executor.parallel_utils.communication_op import ( + tensor_model_parallel_all_reduce) +from vllm.test_utils import (init_test_distributed_environment, + multi_process_tensor_parallel) + +random.seed(42) +test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)] +for i, v in enumerate(test_sizes): + test_sizes[i] -= v % 8 + + +@ray.remote(num_gpus=1, max_calls=1) +def graph_allreduce(world_size, rank, distributed_init_port): + del os.environ["CUDA_VISIBLE_DEVICES"] + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(1, world_size, rank, + distributed_init_port) + + custom_ar.init_custom_ar() + for sz in test_sizes: + for dtype in [torch.float32, torch.float16, torch.bfloat16]: + with custom_ar.capture(): + # use integers so result matches NCCL exactly + inp1 = torch.randint(1, + 16, (sz, ), + dtype=dtype, + device=torch.cuda.current_device()) + inp2 = torch.randint(1, + 16, (sz, ), + dtype=dtype, + device=torch.cuda.current_device()) + torch.cuda.synchronize() + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + out1 = tensor_model_parallel_all_reduce(inp1) + # the input buffer is immediately modified to test + # synchronization + dist.all_reduce(inp1) + out2 = tensor_model_parallel_all_reduce(inp2) + dist.all_reduce(inp2) + graph.replay() + assert torch.allclose(out1, inp1) + assert torch.allclose(out2, inp2) + + +@ray.remote(num_gpus=1, max_calls=1) +def eager_allreduce(world_size, rank, distributed_init_port): + del os.environ["CUDA_VISIBLE_DEVICES"] + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(1, world_size, rank, + distributed_init_port) + + sz = 1024 + custom_ar.init_custom_ar() + fa = custom_ar.get_handle() + inp = torch.ones(sz, dtype=torch.float32, device=device) + out = fa.all_reduce_unreg(inp) + assert torch.allclose(out, inp * world_size) + + inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) + out = fa.all_reduce_unreg(inp) + assert torch.allclose(out, inp * world_size) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize("tensor_parallel_size", [2]) +@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce]) +def test_multi_process_tensor_parallel(tensor_parallel_size, test_target): + multi_process_tensor_parallel(tensor_parallel_size, test_target) + + +if __name__ == "__main__": + multi_process_tensor_parallel(2, graph_allreduce) diff --git a/vllm/config.py b/vllm/config.py index 4f1ce87cb615b..da97eaa77fd35 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -328,6 +328,8 @@ class ParallelConfig: worker_use_ray: Whether to use Ray for model workers. 
Will be set to True if either pipeline_parallel_size or tensor_parallel_size is greater than 1. + disable_custom_all_reduce: Disable the custom all-reduce kernel and + fall back to NCCL. """ def __init__( @@ -336,11 +338,13 @@ def __init__( tensor_parallel_size: int, worker_use_ray: bool, max_parallel_loading_workers: Optional[int] = None, + disable_custom_all_reduce: bool = False, ) -> None: self.pipeline_parallel_size = pipeline_parallel_size self.tensor_parallel_size = tensor_parallel_size self.worker_use_ray = worker_use_ray self.max_parallel_loading_workers = max_parallel_loading_workers + self.disable_custom_all_reduce = disable_custom_all_reduce self.world_size = pipeline_parallel_size * tensor_parallel_size if self.world_size > 1: @@ -351,6 +355,16 @@ def _verify_args(self) -> None: if self.pipeline_parallel_size > 1: raise NotImplementedError( "Pipeline parallelism is not supported yet.") + if is_hip(): + self.disable_custom_all_reduce = True + logger.info( + "Disabled the custom all-reduce kernel because it is not " + "supported on AMD GPUs.") + elif self.pipeline_parallel_size > 1: + self.disable_custom_all_reduce = True + logger.info( + "Disabled the custom all-reduce kernel because it is not " + "supported with pipeline parallelism.") class SchedulerConfig: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 090fa95bcac02..968362c468deb 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -35,6 +35,7 @@ class EngineArgs: quantization: Optional[str] = None enforce_eager: bool = False max_context_len_to_capture: int = 8192 + disable_custom_all_reduce: bool = False enable_lora: bool = False max_loras: int = 1 max_lora_rank: int = 16 @@ -208,6 +209,10 @@ def add_cli_args( help='maximum context length covered by CUDA ' 'graphs. When a sequence has context length ' 'larger than this, we fall back to eager mode.') + parser.add_argument('--disable-custom-all-reduce', + action='store_true', + default=EngineArgs.disable_custom_all_reduce, + help='See ParallelConfig') # LoRA related configs parser.add_argument('--enable-lora', action='store_true', @@ -269,7 +274,8 @@ def create_engine_configs( parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray, - self.max_parallel_loading_workers) + self.max_parallel_loading_workers, + self.disable_custom_all_reduce) scheduler_config = SchedulerConfig(self.max_num_batched_tokens, self.max_num_seqs, model_config.max_model_len, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 0dedc232292dd..87752eea02811 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -82,6 +82,7 @@ def __init__( f"download_dir={model_config.download_dir!r}, " f"load_format={model_config.load_format}, " f"tensor_parallel_size={parallel_config.tensor_parallel_size}, " + f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, " f"quantization={model_config.quantization}, " f"enforce_eager={model_config.enforce_eager}, " f"seed={model_config.seed})") diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index aab0c9615f411..614e6fa520c8c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -64,6 +64,7 @@ class LLM: max_context_len_to_capture: Maximum context len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. 
+ disable_custom_all_reduce: See ParallelConfig """ def __init__( @@ -82,6 +83,7 @@ def __init__( swap_space: int = 4, enforce_eager: bool = False, max_context_len_to_capture: int = 8192, + disable_custom_all_reduce: bool = False, **kwargs, ) -> None: if "disable_log_stats" not in kwargs: @@ -101,6 +103,7 @@ def __init__( swap_space=swap_space, enforce_eager=enforce_eager, max_context_len_to_capture=max_context_len_to_capture, + disable_custom_all_reduce=disable_custom_all_reduce, **kwargs, ) self.llm_engine = LLMEngine.from_engine_args(engine_args) diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index fff6920be72b0..65671994f3309 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -10,17 +10,27 @@ get_tensor_model_parallel_world_size, get_tensor_model_parallel_group, ) +from vllm.model_executor.parallel_utils.custom_all_reduce import custom_all_reduce def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: """All-reduce the input tensor across model parallel group. - NOTE: This operation is applied in-place on the input tensor. + NOTE: This operation will be applied in-place on the input tensor if + disable_custom_all_reduce is set to True. Otherwise, this operation may or + may not be applied in place depending on whether custom all reduce is + invoked for a particular tensor, which further depends on the tensor size + and GPU topology. + + TLDR: always assume this function modifies its input, but use the return + value as the output. """ # Bypass the function if we are using only 1 GPU. if get_tensor_model_parallel_world_size() == 1: return input_ - # All-reduce. + out = custom_all_reduce(input_) + if out is not None: + return out torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group()) return input_ diff --git a/vllm/model_executor/parallel_utils/custom_all_reduce.py b/vllm/model_executor/parallel_utils/custom_all_reduce.py new file mode 100644 index 0000000000000..5b88649cc2129 --- /dev/null +++ b/vllm/model_executor/parallel_utils/custom_all_reduce.py @@ -0,0 +1,223 @@ +from contextlib import contextmanager +from typing import Optional + +import torch +import torch.distributed as dist + +from vllm.logger import init_logger +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank) + +try: + from vllm._C import custom_ar + import pynvml +except ImportError: + # For AMD GPUs + custom_ar = None + pynvml = None + +logger = init_logger(__name__) + +_CA_HANDLE = None +_IS_CAPTURING = False +_SUPPORTED_WORLD_SIZES = [2, 4, 6, 8] + + +def init_custom_ar() -> None: + global _CA_HANDLE + if _CA_HANDLE is not None: + return + rank = get_tensor_model_parallel_rank() + world_size = get_tensor_model_parallel_world_size() + if world_size not in _SUPPORTED_WORLD_SIZES: + logger.warn( + "Custom allreduce is disabled due to an unsupported world size: " + "%d. Supported world sizes: %s. To slience this warning, specify" + "disable_custom_all_reduce=True explicitly.", world_size, + str(_SUPPORTED_WORLD_SIZES)) + return + if not _can_p2p(rank, world_size): + logger.warn( + "Custom allreduce is disabled because your platform lacks GPU P2P" + " capability. 
To slience this warning, specify" + "disable_custom_all_reduce=True explicitly.") + return + _CA_HANDLE = CustomAllreduce(rank, world_size) + + +def begin_capture() -> None: + global _IS_CAPTURING + _IS_CAPTURING = True + + +def end_capture() -> None: + global _IS_CAPTURING + _IS_CAPTURING = False + + +def is_capturing() -> bool: + return _IS_CAPTURING and _CA_HANDLE is not None + + +def get_handle() -> Optional["CustomAllreduce"]: + return _CA_HANDLE + + +@contextmanager +def capture(): + try: + begin_capture() + yield + finally: + end_capture() + handle = get_handle() + if handle is not None: + handle.register_graph_buffers() + + +def custom_all_reduce(input: torch.Tensor) -> Optional[torch.Tensor]: + ca_handle = get_handle() + # when custom allreduce is disabled, this will be None + if ca_handle is None: + return + if is_capturing(): + if torch.cuda.is_current_stream_capturing(): + if ca_handle.should_custom_ar(input): + return ca_handle.all_reduce_reg(input) + else: + if ca_handle.should_custom_ar(input): + # if warm up, mimic the allocation pattern + # since custom allreduce is out-of-place + return torch.empty_like(input) + else: + # note: outside of cuda graph context, + # custom allreduce incurs a cost of cudaMemcpy, which should + # be small(<=1% of overall latency) compared to the performance + # gains of using custom kernels + if ca_handle.should_custom_ar(input): + return ca_handle.all_reduce_unreg(input) + + +@contextmanager +def _nvml(): + try: + pynvml.nvmlInit() + yield + finally: + pynvml.nvmlShutdown() + + +# query if the set of gpus are fully connected by nvlink (1 hop) +@_nvml() +def _is_full_nvlink(rank, world_size): + handle = pynvml.nvmlDeviceGetHandleByIndex(rank) + for i in range(world_size): + if i != rank: + try: + link_state = pynvml.nvmlDeviceGetNvLinkState(handle, i) + if not link_state: + return False + except pynvml.NVMLError as error: + logger.info( + f"NVLink detection failed with message \"{str(error)}\". " + "This is normal if your machine has no NVLink equipped") + return False + return True + + +def _can_p2p(rank: int, world_size: int) -> bool: + for i in range(world_size): + if i == rank: + continue + if not torch.cuda.can_device_access_peer(rank, i): + return False + return True + + +class CustomAllreduce: + + # max_size: max supported allreduce size + def __init__(self, rank, world_size, max_size=8192 * 1024) -> None: + # buffers memory are owned by this Python class and passed to C++ + # meta data composes of two parts: meta data for synchronization + # (256 bytes) and a temporary buffer for storing intermediate + # allreduce results. + self.meta = torch.zeros(custom_ar.meta_size() + max_size, + dtype=torch.uint8, + device="cuda") + # This is a pre-registered IPC buffer. In eager mode, input tensors + # are first copied into this buffer before allreduce is performed + self.buffer = torch.empty(max_size, dtype=torch.uint8, device="cuda") + # This is a buffer for storing the tuples of pointers pointing to + # IPC buffers from all ranks. Each registered tuple has size of + # 8*world_size bytes where world_size is at most 8. Allocating 8MB + # is enough for 131072 such tuples. The largest model I've seen only + # needs less than 10000 of registered tuples. 
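+        # Python only allocates this buffer; it is managed by the C++ custom_ar code.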
+ self.rank_data = torch.empty(8 * 1024 * 1024, + dtype=torch.uint8, + device="cuda") + self.max_size = max_size + self.world_size = world_size + handles, offsets = self._get_ipc_meta(self.meta) + self.full_nvlink = _is_full_nvlink(rank, world_size) + self._ptr = custom_ar.init_custom_ar(self.meta, self.rank_data, + handles, offsets, rank, + self.full_nvlink) + self.fast_cond = self.full_nvlink or world_size <= 2 + self.register_buffer(self.buffer) + + def _get_ipc_meta(self, inp: torch.Tensor): + data = inp.untyped_storage()._share_cuda_() + shard_data = ( + data[1], # ipc handle to base ptr + data[3], # offset of base ptr + ) + return self._gather_ipc_meta(shard_data) + + def _gather_ipc_meta(self, shard_data): + all_data = [None] * self.world_size + dist.all_gather_object(all_data, shard_data) + + handles = [] + offsets = [] + for i in range(len(all_data)): + handles.append(all_data[i][0]) + offsets.append(all_data[i][1]) + return handles, offsets + + def register_buffer(self, inp: torch.Tensor): + handles, offsets = self._get_ipc_meta(inp) + custom_ar.register_buffer(self._ptr, inp, handles, offsets) + + def register_graph_buffers(self): + handle, offset = custom_ar.get_graph_buffer_ipc_meta(self._ptr) + handles, offsets = self._gather_ipc_meta((bytes(handle), offset)) + logger.info("Registering %d cuda graph addresses", len(offset)) + custom_ar.register_graph_buffers(self._ptr, handles, offsets) + + def should_custom_ar(self, inp: torch.Tensor): + return custom_ar.should_custom_ar(inp, self.max_size, self.world_size, + self.full_nvlink) + + # all reduce, assuming inp tensor is IPC registered with register_buffer, + # or, in the context of cuda graphs, register_graph_buffers + def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None): + if out is None: + out = torch.empty_like(inp) + custom_ar.all_reduce_reg(self._ptr, inp, out) + return out + + # all reduce, assuming inp tensor is NOT IPC registered + def all_reduce_unreg(self, inp: torch.Tensor, out: torch.Tensor = None): + if out is None: + out = torch.empty_like(inp) + custom_ar.all_reduce_unreg(self._ptr, inp, self.buffer, out) + return out + + def close(self): + if self._ptr: + custom_ar.dispose(self._ptr) + self._ptr = 0 + + def __del__(self): + self.close() diff --git a/vllm/test_utils.py b/vllm/test_utils.py new file mode 100644 index 0000000000000..4f74c05038e70 --- /dev/null +++ b/vllm/test_utils.py @@ -0,0 +1,38 @@ +import ray + +from vllm.config import ParallelConfig +from vllm.utils import get_open_port +from vllm.worker.worker import init_distributed_environment + + +def init_test_distributed_environment( + pipeline_parallel_size: int, + tensor_parallel_size: int, + rank: int, + distributed_init_port: str, +) -> None: + parallel_config = ParallelConfig(pipeline_parallel_size, + tensor_parallel_size, + worker_use_ray=True) + distributed_init_method = f"tcp://localhost:{distributed_init_port}" + init_distributed_environment(parallel_config, rank, + distributed_init_method) + + +def multi_process_tensor_parallel( + tensor_parallel_size: int, + test_target, +) -> None: + # Using ray helps debugging the error when it failed + # as compared to multiprocessing. 
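+    # Each rank below runs test_target as a Ray remote task; ray.get() blocks
+    # until every rank has finished.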
+ ray.init() + + distributed_init_port = get_open_port() + refs = [] + for rank in range(tensor_parallel_size): + refs.append( + test_target.remote(tensor_parallel_size, rank, + distributed_init_port)) + ray.get(refs) + + ray.shutdown() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 985115613e044..60f5b71d35615 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -10,6 +10,7 @@ from vllm.model_executor import get_model, InputMetadata, SamplingMetadata from vllm.model_executor.parallel_utils.communication_op import ( broadcast_tensor_dict) +from vllm.model_executor.parallel_utils import custom_all_reduce from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager @@ -651,37 +652,38 @@ def capture_model(self, kv_caches: List[KVCache]) -> None: # NOTE: Capturing the largest batch size first may help reduce the # memory usage of CUDA graph. - for batch_size in reversed(batch_size_capture_list): - # Create dummy input_metadata. - input_metadata = InputMetadata( - is_prompt=False, - slot_mapping=slot_mapping[:batch_size], - prompt_lens=None, - max_seq_len=None, - start_loc=None, - max_context_len=self.max_context_len_to_capture, - context_lens=context_lens[:batch_size], - block_tables=block_tables[:batch_size], - use_cuda_graph=True, - ) - - if self.lora_config: - lora_mapping = LoRAMapping( - [0] * batch_size, - [0] * batch_size, + with custom_all_reduce.capture(): + for batch_size in reversed(batch_size_capture_list): + # Create dummy input_metadata. + input_metadata = InputMetadata( + is_prompt=False, + slot_mapping=slot_mapping[:batch_size], + prompt_lens=None, + max_seq_len=None, + start_loc=None, + max_context_len=self.max_context_len_to_capture, + context_lens=context_lens[:batch_size], + block_tables=block_tables[:batch_size], + use_cuda_graph=True, ) - self.set_active_loras(set(), lora_mapping) - graph_runner = CUDAGraphRunner(self.model) - graph_runner.capture( - input_tokens[:batch_size], - input_positions[:batch_size], - kv_caches, - input_metadata, - memory_pool=self.graph_memory_pool, - ) - self.graph_memory_pool = graph_runner.graph.pool() - self.graph_runners[batch_size] = graph_runner + if self.lora_config: + lora_mapping = LoRAMapping( + [0] * batch_size, + [0] * batch_size, + ) + self.set_active_loras(set(), lora_mapping) + + graph_runner = CUDAGraphRunner(self.model) + graph_runner.capture( + input_tokens[:batch_size], + input_positions[:batch_size], + kv_caches, + input_metadata, + memory_pool=self.graph_memory_pool, + ) + self.graph_memory_pool = graph_runner.graph.pool() + self.graph_runners[batch_size] = graph_runner end_time = time.perf_counter() elapsed_time = end_time - start_time diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 845283586e147..f1dad64b2b27a 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -11,6 +11,7 @@ from vllm.model_executor import set_random_seed from vllm.model_executor.parallel_utils.communication_op import ( broadcast_tensor_dict) +from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar from vllm.model_executor.parallel_utils.parallel_state import ( ensure_model_parallel_initialized) from vllm.sequence import SamplerOutput, SequenceGroupMetadata @@ -78,9 +79,10 @@ def init_model(self) -> None: _check_if_gpu_supports_dtype(self.model_config.dtype) # Initialize the distributed environment. 
- _init_distributed_environment(self.parallel_config, self.rank, - self.distributed_init_method) - + init_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method) + if not self.parallel_config.disable_custom_all_reduce: + init_custom_ar() # Initialize the model. set_random_seed(self.model_config.seed) @@ -219,7 +221,7 @@ def list_loras(self) -> Set[int]: return self.model_runner.list_loras() -def _init_distributed_environment( +def init_distributed_environment( parallel_config: ParallelConfig, rank: int, distributed_init_method: Optional[str] = None, From 5f036d2bcc5244ca431212167c94700e5ae7a8e0 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 27 Jan 2024 15:43:40 -0800 Subject: [PATCH 007/112] [Minor] Fix warning on Ray dependencies (#2630) --- vllm/engine/ray_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index 1cb5fcda344f1..afbc33ed19a0c 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -43,7 +43,7 @@ def set_cuda_visible_devices(self, device_ids) -> None: except ImportError as e: logger.warning(f"Failed to import Ray with {e!r}. " "For distributed inference, please install Ray with " - "`pip install ray pandas pyarrow`.") + "`pip install ray`.") ray = None RayWorkerVllm = None From f8ecb84c0283a7f1ba02ee732c9f044f8f9d36ee Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 27 Jan 2024 17:46:56 -0800 Subject: [PATCH 008/112] Speed up Punica compilation (#2632) --- .buildkite/test-template.j2 | 2 +- csrc/punica/bgmv/bgmv_all.cu | 21 ------------------- csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu | 4 ++++ csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu | 4 ++++ csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu | 4 ++++ csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu | 4 ++++ csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu | 4 ++++ csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu | 4 ++++ csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu | 4 ++++ csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu | 4 ++++ csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu | 4 ++++ csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu | 4 ++++ csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu | 4 ++++ csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu | 4 ++++ csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu | 4 ++++ csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu | 4 ++++ csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu | 4 ++++ csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu | 4 ++++ csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu | 4 ++++ csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu | 4 ++++ csrc/punica/bgmv/generator.py | 27 +++++++++++++++++++++++++ 21 files changed, 100 insertions(+), 22 deletions(-) delete mode 100644 csrc/punica/bgmv/bgmv_all.cu create mode 100644 csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu create mode 100644 csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu create mode 100644 csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu create mode 100644 csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu create mode 100644 csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu create mode 100644 csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu create mode 100644 csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu create mode 100644 csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu create mode 100644 csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu create mode 100644 csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu create mode 100644 csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu create mode 100644 csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu create mode 100644 csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu create mode 100644 csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu create mode 100644 csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu 
create mode 100644 csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu create mode 100644 csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu create mode 100644 csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu create mode 100644 csrc/punica/bgmv/generator.py diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index b355112935392..7c709b6097fd4 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -5,7 +5,7 @@ steps: - label: ":docker: build image" commands: - - "docker build --tag {{ docker_image }} --target test --progress plain ." + - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." - "docker push {{ docker_image }}" env: DOCKER_BUILDKIT: "1" diff --git a/csrc/punica/bgmv/bgmv_all.cu b/csrc/punica/bgmv/bgmv_all.cu deleted file mode 100644 index 2502a67e3c813..0000000000000 --- a/csrc/punica/bgmv/bgmv_all.cu +++ /dev/null @@ -1,21 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_bfloat16) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_half) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_half) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_half) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_bfloat16) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_bfloat16) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_bfloat16) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_bfloat16) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_half) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_bfloat16) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_bfloat16) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_half) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_half) -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu b/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu new file mode 100644 index 0000000000000..c642e94925fe5 --- /dev/null +++ b/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu b/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu new file mode 100644 index 0000000000000..e8202dff561d9 --- /dev/null +++ b/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_half) diff --git a/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu b/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu new file mode 100644 index 0000000000000..3e7cf31dead0f --- /dev/null +++ b/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu b/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu new file mode 100644 index 0000000000000..68277fa6b7d56 --- /dev/null +++ b/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu @@ -0,0 +1,4 @@ 
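+// Per-dtype instantiation mirroring the template in generator.py.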
+#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_half) diff --git a/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu b/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu new file mode 100644 index 0000000000000..0607cebfeac40 --- /dev/null +++ b/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu b/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu new file mode 100644 index 0000000000000..3b7531b8fbcfc --- /dev/null +++ b/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu b/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu new file mode 100644 index 0000000000000..b3b74aa3ec904 --- /dev/null +++ b/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu b/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu new file mode 100644 index 0000000000000..3cc87f5df76a1 --- /dev/null +++ b/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu b/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu new file mode 100644 index 0000000000000..9eda98bd8ddcf --- /dev/null +++ b/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu b/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu new file mode 100644 index 0000000000000..f1db6df5f7338 --- /dev/null +++ b/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu b/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu new file mode 100644 index 0000000000000..060f9ebb8c2b1 --- /dev/null +++ b/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu b/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu new file mode 100644 index 0000000000000..c01ddd009d74e --- /dev/null +++ b/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu b/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu new file mode 100644 index 0000000000000..f45183ffd3486 --- /dev/null +++ b/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu b/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu new file mode 100644 index 0000000000000..b37e44570bf40 --- /dev/null +++ b/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" 
+#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu b/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu new file mode 100644 index 0000000000000..06718cbb0a3e9 --- /dev/null +++ b/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu b/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu new file mode 100644 index 0000000000000..4097743488087 --- /dev/null +++ b/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu b/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu new file mode 100644 index 0000000000000..41fb0e45ef4e6 --- /dev/null +++ b/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu b/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu new file mode 100644 index 0000000000000..50b7ead9fcefd --- /dev/null +++ b/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu @@ -0,0 +1,4 @@ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_half) diff --git a/csrc/punica/bgmv/generator.py b/csrc/punica/bgmv/generator.py new file mode 100644 index 0000000000000..66de56d74f3e7 --- /dev/null +++ b/csrc/punica/bgmv/generator.py @@ -0,0 +1,27 @@ +DTYPES = ["fp16", "bf16", "fp32"] +DTYPE_MAP = { + "fp16": "nv_half", + "bf16": "nv_bfloat16", + "fp32": "float", +} + +TEMPLATE = """ +#include "bgmv_config.h" +#include "bgmv_impl.cuh" + +FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype}) +""".lstrip() + +for input_dtype in DTYPES: + for output_dtype in DTYPES: + for weight_dtype in DTYPES: + if weight_dtype == "fp32": + # FP32 weights are not supported. 
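+                # Skip emitting a .cu file for this dtype combination.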
+ continue + kernel_definition = TEMPLATE.format( + input_dtype=DTYPE_MAP[input_dtype], + output_dtype=DTYPE_MAP[output_dtype], + weight_dtype=DTYPE_MAP[weight_dtype]) + filename = f"bgmv_{input_dtype}_{output_dtype}_{weight_dtype}.cu" + with open(filename, "w") as f: + f.write(kernel_definition) From 89be30fa7d51035cee96d1573ffbe8b8ba6db878 Mon Sep 17 00:00:00 2001 From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Date: Sat, 27 Jan 2024 23:28:37 -0800 Subject: [PATCH 009/112] Small async_llm_engine refactor (#2618) --- vllm/engine/async_llm_engine.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index c7591945be243..2885aab9f3161 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -53,7 +53,7 @@ def put(self, item: RequestOutput) -> None: self._queue.put_nowait(item) def finish(self) -> None: - self._queue.put_nowait(StopIteration) + self._queue.put_nowait(StopAsyncIteration()) self._finished = True @property @@ -65,9 +65,7 @@ def __aiter__(self): async def __anext__(self) -> RequestOutput: result = await self._queue.get() - if result is StopIteration: - raise StopAsyncIteration - elif isinstance(result, Exception): + if isinstance(result, Exception): raise result return result From 7d648418b8b1aadb90489ef18cff1763ffc82ed5 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sun, 28 Jan 2024 14:27:22 -0800 Subject: [PATCH 010/112] Update Ray version requirements (#2636) --- requirements-rocm.txt | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-rocm.txt b/requirements-rocm.txt index a846f929ef226..7b42ee067310b 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -2,7 +2,7 @@ ninja # For faster builds. typing-extensions>=4.8.0 starlette psutil -ray >= 2.5.1 +ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy tokenizers>=0.15.0 diff --git a/requirements.txt b/requirements.txt index 19871bdcc9548..2bf527ccc3a77 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ ninja # For faster builds. psutil -ray >= 2.5.1 +ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. 
numpy torch == 2.1.2 From 9090bf02e74334a8020b454814e0d00fa780fd79 Mon Sep 17 00:00:00 2001 From: zhaoyang-star Date: Mon, 29 Jan 2024 08:43:54 +0800 Subject: [PATCH 011/112] Support FP8-E5M2 KV Cache (#2279) Co-authored-by: zhaoyang Co-authored-by: Zhuohan Li --- benchmarks/benchmark_latency.py | 8 + benchmarks/benchmark_throughput.py | 12 +- .../kernels/benchmark_paged_attention.py | 33 ++- csrc/attention/attention_dtypes.h | 1 + csrc/attention/attention_kernels.cu | 259 ++++++++++------ csrc/attention/dtype_fp8_e5m2.cuh | 35 +++ csrc/cache.h | 8 +- csrc/cache_kernels.cu | 135 +++++++-- csrc/dispatch_utils.h | 10 + csrc/ops.h | 6 +- csrc/pybind.cpp | 4 + .../fp8_e5m2_kvcache/quant_utils.cuh | 278 ++++++++++++++++++ .../source/quantization/fp8_e5m2_kv_cache.rst | 32 ++ setup.py | 3 + tests/kernels/conftest.py | 41 +-- tests/kernels/test_attention.py | 41 ++- tests/kernels/test_cache.py | 10 +- vllm/config.py | 29 +- vllm/engine/arg_utils.py | 11 +- vllm/engine/llm_engine.py | 6 + vllm/model_executor/input_metadata.py | 6 +- vllm/model_executor/layers/attention.py | 3 + vllm/utils.py | 110 ++++++- vllm/worker/cache_engine.py | 15 +- vllm/worker/model_runner.py | 7 + vllm/worker/worker.py | 5 +- 26 files changed, 912 insertions(+), 196 deletions(-) create mode 100644 csrc/attention/dtype_fp8_e5m2.cuh create mode 100644 csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh create mode 100644 docs/source/quantization/fp8_e5m2_kv_cache.rst diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index d75d690cc66d4..7173134358762 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -24,6 +24,7 @@ def main(args: argparse.Namespace): trust_remote_code=args.trust_remote_code, dtype=args.dtype, enforce_eager=args.enforce_eager, + kv_cache_dtype=args.kv_cache_dtype, ) sampling_params = SamplingParams( @@ -117,6 +118,13 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--enforce-eager', action='store_true', help='enforce eager mode and disable CUDA graph') + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=['auto', 'fp8_e5m2'], + default='auto', + help= + 'Data type for kv cache storage. If "auto", will use model data type.') parser.add_argument( '--profile', action='store_true', diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 3aac479c01bd2..d45d33307c912 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -71,6 +71,7 @@ def run_vllm( dtype: str, max_model_len: Optional[int], enforce_eager: bool, + kv_cache_dtype: str, ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -83,6 +84,7 @@ def run_vllm( dtype=dtype, max_model_len=max_model_len, enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, ) # Add the requests to the engine. 
@@ -206,7 +208,8 @@ def main(args: argparse.Namespace): args.quantization, args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, - args.max_model_len, args.enforce_eager) + args.max_model_len, args.enforce_eager, + args.kv_cache_dtype) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -284,6 +287,13 @@ def main(args: argparse.Namespace): parser.add_argument("--enforce-eager", action="store_true", help="enforce eager execution") + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=["auto", "fp8_e5m2"], + default="auto", + help= + 'Data type for kv cache storage. If "auto", will use model data type.') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 935393e9942ce..56fe1b921d44e 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,9 +1,11 @@ +from typing import Optional import argparse import random import time import torch +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random from vllm._C import ops NUM_BLOCKS = 1024 @@ -23,6 +25,7 @@ def main( dtype: torch.dtype, seed: int, do_profile: bool, + kv_cache_dtype: Optional[str] = None, ) -> None: random.seed(seed) torch.random.manual_seed(seed) @@ -59,15 +62,10 @@ def main( block_tables = torch.tensor(block_tables, dtype=torch.int, device="cuda") # Create the KV cache. - x = 16 // torch.tensor([], dtype=dtype).element_size() - key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, block_size, x) - key_cache = torch.empty(size=key_cache_shape, dtype=dtype, device="cuda") - key_cache.uniform_(-scale, scale) - value_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size, block_size) - value_cache = torch.empty(size=value_cache_shape, - dtype=dtype, - device="cuda") - value_cache.uniform_(-scale, scale) + key_caches, value_caches = create_kv_caches_with_random( + NUM_BLOCKS, block_size, 1, num_kv_heads, head_size, kv_cache_dtype, + dtype) + key_cache, value_cache = key_caches[0], value_caches[0] # Prepare for the paged attention kernel. output = torch.empty_like(query) @@ -106,6 +104,7 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float: block_size, max_context_len, alibi_slopes, + kv_cache_dtype, ) elif version == "v2": ops.paged_attention_v2( @@ -123,6 +122,7 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float: block_size, max_context_len, alibi_slopes, + kv_cache_dtype, ) else: raise ValueError(f"Invalid version: {version}") @@ -168,16 +168,18 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float: default="half") parser.add_argument("--seed", type=int, default=0) parser.add_argument("--profile", action="store_true") + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=["auto", "fp8_e5m2"], + default="auto", + help= + 'Data type for kv cache storage. 
If "auto", will use model data type.') args = parser.parse_args() print(args) if args.num_query_heads % args.num_kv_heads != 0: raise ValueError("num_query_heads must be divisible by num_kv_heads") - dtype_to_torch_dtype = { - "half": torch.half, - "bfloat16": torch.bfloat16, - "float": torch.float, - } main( version=args.version, num_seqs=args.batch_size, @@ -187,7 +189,8 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float: head_size=args.head_size, block_size=args.block_size, use_alibi=args.use_alibi, - dtype=dtype_to_torch_dtype[args.dtype], + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], seed=args.seed, do_profile=args.profile, + kv_cache_dtype=args.kv_cache_dtype, ) diff --git a/csrc/attention/attention_dtypes.h b/csrc/attention/attention_dtypes.h index 88b4eddec7fc7..61748e6b1eee6 100644 --- a/csrc/attention/attention_dtypes.h +++ b/csrc/attention/attention_dtypes.h @@ -4,3 +4,4 @@ #include "dtype_float16.cuh" #include "dtype_float32.cuh" #include "dtype_bfloat16.cuh" +#include "dtype_fp8_e5m2.cuh" diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 9dcacfbe47d48..a5ddeac740440 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -25,6 +25,7 @@ #include "attention_dtypes.h" #include "attention_utils.cuh" +#include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh" #include @@ -79,17 +80,19 @@ inline __device__ float block_sum(float* red_smem, float sum) { // Grid: (num_heads, num_seqs, max_num_partitions). template< typename scalar_t, + typename cache_t, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, + bool IS_FP8_E5M2_KV_CACHE, int PARTITION_SIZE = 0> // Zero means no partitioning. __device__ void paged_attention_kernel( float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] const int num_kv_heads, // [num_heads] const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] @@ -145,6 +148,9 @@ __device__ void paged_attention_kernel( constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1); using K_vec = typename Vec::Type; using Q_vec = typename Vec::Type; +#ifdef ENABLE_FP8_E5M2 + using Quant_vec = typename Vec::Type; +#endif constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE; constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE; @@ -176,7 +182,7 @@ __device__ void paged_attention_kernel( // x == THREAD_GROUP_SIZE * VEC_SIZE // Each thread group fetches x elements from the key at a time. - constexpr int x = 16 / sizeof(scalar_t); + constexpr int x = 16 / sizeof(cache_t); float qk_max = -FLT_MAX; // Iterate over the key blocks. 
@@ -202,13 +208,23 @@ __device__ void paged_attention_kernel( #pragma unroll for (int j = 0; j < NUM_VECS_PER_THREAD; j++) { - const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride - + kv_head_idx * kv_head_stride - + physical_block_offset * x; + const cache_t* k_ptr = k_cache + physical_block_number * kv_block_stride + + kv_head_idx * kv_head_stride + + physical_block_offset * x; const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE; const int offset1 = (vec_idx * VEC_SIZE) / x; const int offset2 = (vec_idx * VEC_SIZE) % x; - k_vecs[j] = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); + if constexpr (IS_FP8_E5M2_KV_CACHE) { +#ifdef ENABLE_FP8_E5M2 + Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); + // Vector conversion from Quant_vec to K_vec. + k_vecs[j] = fp8_e5m2_unscaled::vec_conversion(k_vec_quant); +#else + assert(false); +#endif + } else { + k_vecs[j] = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); + } } // Compute dot product. @@ -282,6 +298,9 @@ __device__ void paged_attention_kernel( constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE); using V_vec = typename Vec::Type; using L_vec = typename Vec::Type; +#ifdef ENABLE_FP8_E5M2 + using V_quant_vec = typename Vec::Type; +#endif using Float_L_vec = typename FloatVec::Type; constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE; @@ -307,14 +326,25 @@ __device__ void paged_attention_kernel( L_vec logits_vec; from_float(logits_vec, *reinterpret_cast(logits + token_idx - start_token_idx)); - const scalar_t* v_ptr = v_cache + physical_block_number * kv_block_stride - + kv_head_idx * kv_head_stride; + const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride + + kv_head_idx * kv_head_stride; #pragma unroll for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; if (row_idx < HEAD_SIZE) { const int offset = row_idx * BLOCK_SIZE + physical_block_offset; - V_vec v_vec = *reinterpret_cast(v_ptr + offset); + V_vec v_vec; + if constexpr (IS_FP8_E5M2_KV_CACHE) { +#ifdef ENABLE_FP8_E5M2 + V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); + // Vector conversion from V_quant_vec to V_vec. + v_vec = fp8_e5m2_unscaled::vec_conversion(v_quant_vec); +#else + assert(false); +#endif + } else { + v_vec = *reinterpret_cast(v_ptr + offset); + } if (block_idx == num_context_blocks - 1) { // NOTE(woosuk): When v_vec contains the tokens that are out of the context, // we should explicitly zero out the values since they may contain NaNs. @@ -395,14 +425,16 @@ __device__ void paged_attention_kernel( // Grid: (num_heads, num_seqs, 1). 
template< typename scalar_t, + typename cache_t, int HEAD_SIZE, int BLOCK_SIZE, - int NUM_THREADS> + int NUM_THREADS, + bool IS_FP8_E5M2_KV_CACHE> __global__ void paged_attention_v1_kernel( scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] const int num_kv_heads, // [num_heads] const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] @@ -412,7 +444,7 @@ __global__ void paged_attention_v1_kernel( const int q_stride, const int kv_block_stride, const int kv_head_stride) { - paged_attention_kernel( + paged_attention_kernel( /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride); @@ -421,17 +453,19 @@ __global__ void paged_attention_v1_kernel( // Grid: (num_heads, num_seqs, max_num_partitions). template< typename scalar_t, + typename cache_t, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, + bool IS_FP8_E5M2_KV_CACHE, int PARTITION_SIZE> __global__ void paged_attention_v2_kernel( float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] const int num_kv_heads, // [num_heads] const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] @@ -441,7 +475,7 @@ __global__ void paged_attention_v2_kernel( const int q_stride, const int kv_block_stride, const int kv_head_stride) { - paged_attention_kernel( + paged_attention_kernel( exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride); @@ -550,10 +584,10 @@ __global__ void paged_attention_v2_reduce_kernel( #define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ - ((void*)vllm::paged_attention_v1_kernel), \ - shared_mem_size); \ - vllm::paged_attention_v1_kernel \ - <<>>( \ + ((void*)vllm::paged_attention_v1_kernel), shared_mem_size); \ + vllm::paged_attention_v1_kernel<<>>( \ out_ptr, \ query_ptr, \ key_cache_ptr, \ @@ -571,7 +605,9 @@ __global__ void paged_attention_v2_reduce_kernel( // TODO(woosuk): Tune NUM_THREADS. 
template< typename T, + typename CACHE_T, int BLOCK_SIZE, + bool IS_FP8_E5M2_KV_CACHE, int NUM_THREADS = 128> void paged_attention_v1_launcher( torch::Tensor& out, @@ -602,8 +638,8 @@ void paged_attention_v1_launcher( T* out_ptr = reinterpret_cast(out.data_ptr()); T* query_ptr = reinterpret_cast(query.data_ptr()); - T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); int* block_tables_ptr = block_tables.data_ptr(); int* context_lens_ptr = context_lens.data_ptr(); @@ -647,35 +683,35 @@ void paged_attention_v1_launcher( } } -#define CALL_V1_LAUNCHER(T, BLOCK_SIZE) \ - paged_attention_v1_launcher( \ - out, \ - query, \ - key_cache, \ - value_cache, \ - num_kv_heads, \ - scale, \ - block_tables, \ - context_lens, \ - max_context_len, \ +#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE) \ + paged_attention_v1_launcher( \ + out, \ + query, \ + key_cache, \ + value_cache, \ + num_kv_heads, \ + scale, \ + block_tables, \ + context_lens, \ + max_context_len, \ alibi_slopes); // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. -#define CALL_V1_LAUNCHER_BLOCK_SIZE(T) \ - switch (block_size) { \ - case 8: \ - CALL_V1_LAUNCHER(T, 8); \ - break; \ - case 16: \ - CALL_V1_LAUNCHER(T, 16); \ - break; \ - case 32: \ - CALL_V1_LAUNCHER(T, 32); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ +#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ + switch (block_size) { \ + case 8: \ + CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE); \ + break; \ + case 16: \ + CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE); \ + break; \ + case 32: \ + CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ } void paged_attention_v1( @@ -689,20 +725,36 @@ void paged_attention_v1( torch::Tensor& context_lens, // [num_seqs] int block_size, int max_context_len, - const c10::optional& alibi_slopes) { - if (query.dtype() == at::ScalarType::Float) { - CALL_V1_LAUNCHER_BLOCK_SIZE(float); - } else if (query.dtype() == at::ScalarType::Half) { - CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t); - } else if (query.dtype() == at::ScalarType::BFloat16) { - CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16); + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype) { + if (kv_cache_dtype == "auto") { + if (query.dtype() == at::ScalarType::Float) { + CALL_V1_LAUNCHER_BLOCK_SIZE(float, float, false); + } else if (query.dtype() == at::ScalarType::Half) { + CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, false); + } else if (query.dtype() == at::ScalarType::BFloat16) { + CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, false); + } else { + TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); + } + } else if (kv_cache_dtype == "fp8_e5m2") { + if (query.dtype() == at::ScalarType::Float) { + CALL_V1_LAUNCHER_BLOCK_SIZE(float, uint8_t, true); + } else if (query.dtype() == at::ScalarType::Half) { + CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, true); + } else if (query.dtype() == at::ScalarType::BFloat16) { + CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, true); + } else { + TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); + } } else { - TORCH_CHECK(false, 
"Unsupported data type: ", query.dtype()); + TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype); } } #define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ - vllm::paged_attention_v2_kernel \ + vllm::paged_attention_v2_kernel \ <<>>( \ exp_sums_ptr, \ max_logits_ptr, \ @@ -730,7 +782,9 @@ void paged_attention_v1( template< typename T, + typename CACHE_T, int BLOCK_SIZE, + bool IS_FP8_E5M2_KV_CACHE, int NUM_THREADS = 128, int PARTITION_SIZE = 512> void paged_attention_v2_launcher( @@ -768,8 +822,8 @@ void paged_attention_v2_launcher( float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); T* query_ptr = reinterpret_cast(query.data_ptr()); - T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); int* block_tables_ptr = block_tables.data_ptr(); int* context_lens_ptr = context_lens.data_ptr(); @@ -816,38 +870,38 @@ void paged_attention_v2_launcher( } } -#define CALL_V2_LAUNCHER(T, BLOCK_SIZE) \ - paged_attention_v2_launcher( \ - out, \ - exp_sums, \ - max_logits, \ - tmp_out, \ - query, \ - key_cache, \ - value_cache, \ - num_kv_heads, \ - scale, \ - block_tables, \ - context_lens, \ - max_context_len, \ +#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE) \ + paged_attention_v2_launcher( \ + out, \ + exp_sums, \ + max_logits, \ + tmp_out, \ + query, \ + key_cache, \ + value_cache, \ + num_kv_heads, \ + scale, \ + block_tables, \ + context_lens, \ + max_context_len, \ alibi_slopes); // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. 
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T) \ - switch (block_size) { \ - case 8: \ - CALL_V2_LAUNCHER(T, 8); \ - break; \ - case 16: \ - CALL_V2_LAUNCHER(T, 16); \ - break; \ - case 32: \ - CALL_V2_LAUNCHER(T, 32); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ +#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ + switch (block_size) { \ + case 8: \ + CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE); \ + break; \ + case 16: \ + CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE); \ + break; \ + case 32: \ + CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ } void paged_attention_v2( @@ -864,15 +918,30 @@ void paged_attention_v2( torch::Tensor& context_lens, // [num_seqs] int block_size, int max_context_len, - const c10::optional& alibi_slopes) { - if (query.dtype() == at::ScalarType::Float) { - CALL_V2_LAUNCHER_BLOCK_SIZE(float); - } else if (query.dtype() == at::ScalarType::Half) { - CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t); - } else if (query.dtype() == at::ScalarType::BFloat16) { - CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16); + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype) { + if (kv_cache_dtype == "auto") { + if (query.dtype() == at::ScalarType::Float) { + CALL_V2_LAUNCHER_BLOCK_SIZE(float, float, false); + } else if (query.dtype() == at::ScalarType::Half) { + CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, false); + } else if (query.dtype() == at::ScalarType::BFloat16) { + CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, false); + } else { + TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); + } + } else if (kv_cache_dtype == "fp8_e5m2") { + if (query.dtype() == at::ScalarType::Float) { + CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, true); + } else if (query.dtype() == at::ScalarType::Half) { + CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, true); + } else if (query.dtype() == at::ScalarType::BFloat16) { + CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, true); + } else { + TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); + } } else { - TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); + TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype); } } diff --git a/csrc/attention/dtype_fp8_e5m2.cuh b/csrc/attention/dtype_fp8_e5m2.cuh new file mode 100644 index 0000000000000..0580fbb8e863f --- /dev/null +++ b/csrc/attention/dtype_fp8_e5m2.cuh @@ -0,0 +1,35 @@ +#pragma once + +#include "attention_generic.cuh" + +#include +#ifdef ENABLE_FP8_E5M2 +#include +#endif + +namespace vllm { +#ifdef ENABLE_FP8_E5M2 +// fp8 vector types for quantization of kv cache + +template<> +struct Vec { + using Type = uint8_t; +}; + +template<> +struct Vec { + using Type = uint16_t; +}; + +template<> +struct Vec { + using Type = uint32_t; +}; + +template<> +struct Vec { + using Type = uint2; +}; +#endif // ENABLE_FP8_E5M2 + +} // namespace vllm diff --git a/csrc/cache.h b/csrc/cache.h index b26faad2ca814..21c71830f7942 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -20,7 +20,8 @@ void reshape_and_cache( torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, - torch::Tensor& slot_mapping); + torch::Tensor& slot_mapping, + const std::string& kv_cache_dtype); void gather_cached_kv( torch::Tensor& key, @@ -28,3 +29,8 @@ void gather_cached_kv( torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping); + 
+// Just for unittest +void convert_fp8_e5m2( + torch::Tensor& src_cache, + torch::Tensor& dst_cache); diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index b7523cb4c3b52..fe0159e404585 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -4,6 +4,7 @@ #include "cuda_compat.h" #include "dispatch_utils.h" +#include "quantization/fp8_e5m2_kvcache/quant_utils.cuh" #include #include @@ -131,7 +132,7 @@ void copy_blocks( dim3 block(std::min(1024, numel_per_block)); const at::cuda::OptionalCUDAGuard device_guard(cache_device); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( + VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] { vllm::copy_blocks_kernel<<>>( key_cache_ptrs_tensor.data_ptr(), @@ -143,12 +144,12 @@ void copy_blocks( namespace vllm { -template +template __global__ void reshape_and_cache_kernel( const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] - scalar_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - scalar_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] + cache_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + cache_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] const int64_t* __restrict__ slot_mapping, // [num_tokens] const int key_stride, const int value_stride, @@ -185,19 +186,45 @@ __global__ void reshape_and_cache_kernel( + head_idx * head_size * block_size + head_offset * block_size + block_offset; - key_cache[tgt_key_idx] = key[src_key_idx]; - value_cache[tgt_value_idx] = value[src_value_idx]; + scalar_t tgt_key = key[src_key_idx]; + scalar_t tgt_value = value[src_value_idx]; + if constexpr (is_fp8_e5m2_kv_cache) { +#ifdef ENABLE_FP8_E5M2 + key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_key); + value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_value); +#else + assert(false); +#endif + } else { + key_cache[tgt_key_idx] = tgt_key; + value_cache[tgt_value_idx] = tgt_value; + } } } } // namespace vllm +#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ + vllm::reshape_and_cache_kernel<<>>( \ + reinterpret_cast(key.data_ptr()), \ + reinterpret_cast(value.data_ptr()), \ + reinterpret_cast(key_cache.data_ptr()), \ + reinterpret_cast(value_cache.data_ptr()), \ + slot_mapping.data_ptr(), \ + key_stride, \ + value_stride, \ + num_heads, \ + head_size, \ + block_size, \ + x); + void reshape_and_cache( torch::Tensor& key, // [num_tokens, num_heads, head_size] torch::Tensor& value, // [num_tokens, num_heads, head_size] torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] - torch::Tensor& slot_mapping) // [num_tokens] + torch::Tensor& slot_mapping, // [num_tokens] + const std::string& kv_cache_dtype) { int num_tokens = key.size(0); int num_heads = key.size(1); @@ -212,23 +239,25 @@ void reshape_and_cache( dim3 block(std::min(num_heads * head_size, 512)); const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - key.scalar_type(), - "reshape_and_cache_kernel", - [&] { - vllm::reshape_and_cache_kernel<<>>( - key.data_ptr(), - value.data_ptr(), - key_cache.data_ptr(), - value_cache.data_ptr(), - 
slot_mapping.data_ptr(), - key_stride, - value_stride, - num_heads, - head_size, - block_size, - x); - }); + if (kv_cache_dtype == "auto") { + if (key.dtype() == at::ScalarType::Float) { + CALL_RESHAPE_AND_CACHE(float, float, false); + } else if (key.dtype() == at::ScalarType::Half) { + CALL_RESHAPE_AND_CACHE(uint16_t, uint16_t, false); + } else if (key.dtype() == at::ScalarType::BFloat16) { + CALL_RESHAPE_AND_CACHE(__nv_bfloat16, __nv_bfloat16, false); + } + } else if (kv_cache_dtype == "fp8_e5m2") { + if (key.dtype() == at::ScalarType::Float) { + CALL_RESHAPE_AND_CACHE(float, uint8_t, true); + } else if (key.dtype() == at::ScalarType::Half) { + CALL_RESHAPE_AND_CACHE(uint16_t, uint8_t, true); + } else if (key.dtype() == at::ScalarType::BFloat16) { + CALL_RESHAPE_AND_CACHE(__nv_bfloat16, uint8_t, true); + } + } else { + TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype); + } } namespace vllm { @@ -256,12 +285,12 @@ __global__ void gather_cached_kv_kernel( for (int i = threadIdx.x; i < num_tokens; i += blockDim.x) { const int tgt_key_idx = token_idx * key_stride + i; const int tgt_value_idx = token_idx * value_stride + i; - + const int head_idx = i / head_size; const int head_offset = i % head_size; const int x_idx = head_offset / x; // the offset of the [head_size/x] dimension const int x_offset = head_offset % x; - + const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x + head_idx * (head_size / x) * block_size * x + x_idx * block_size * x @@ -373,7 +402,7 @@ void gather_cached_kv( dim3 block(std::min(num_heads * head_size, 512)); const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( + VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( key.scalar_type(), "gather_cached_kv_kernel_optimized", [&] { @@ -391,3 +420,55 @@ void gather_cached_kv( x); }); } + +namespace vllm { + +template +__global__ void convert_fp8_e5m2_kernel( + const Tin* __restrict__ src_cache, + Tout* __restrict__ dst_cache, + const int64_t block_stride) { + const int64_t block_idx = blockIdx.x; + for (int i = threadIdx.x; i < block_stride; i += blockDim.x) { + int64_t idx = block_idx * block_stride + i; +#ifdef ENABLE_FP8_E5M2 + dst_cache[idx] = fp8_e5m2_unscaled::vec_conversion(src_cache[idx]); +#else + assert(false); +#endif + } +} + +} // namespace vllm + +#define CALL_CONVERT_FP8_E5M2(Tout, Tin) \ + vllm::convert_fp8_e5m2_kernel<<>>( \ + reinterpret_cast(src_cache.data_ptr()), \ + reinterpret_cast(dst_cache.data_ptr()), \ + block_stride); + +void convert_fp8_e5m2( + torch::Tensor& src_cache, + torch::Tensor& dst_cache) +{ + int64_t num_blocks = src_cache.size(0); + int64_t block_stride = src_cache.stride(0); + + dim3 grid(num_blocks); + dim3 block(std::min(block_stride, int64_t(512))); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (src_cache.dtype() == at::ScalarType::Float) { + CALL_CONVERT_FP8_E5M2(uint8_t, float); + } else if (src_cache.dtype() == at::ScalarType::Half) { + CALL_CONVERT_FP8_E5M2(uint8_t, uint16_t); + } else if (src_cache.dtype() == at::ScalarType::BFloat16) { + CALL_CONVERT_FP8_E5M2(uint8_t, __nv_bfloat16); + } else if (dst_cache.dtype() == at::ScalarType::Float) { + CALL_CONVERT_FP8_E5M2(float, uint8_t); + } else if (dst_cache.dtype() == at::ScalarType::Half) { + CALL_CONVERT_FP8_E5M2(uint16_t, uint8_t); + } else if (dst_cache.dtype() == at::ScalarType::BFloat16) { + CALL_CONVERT_FP8_E5M2(__nv_bfloat16, uint8_t); + } +} diff --git 
a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index 0ae9cd6415982..85fdfc091e94c 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -14,3 +14,13 @@ #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH( \ TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__)) diff --git a/csrc/ops.h b/csrc/ops.h index 6e996fd0d577b..ce77dd47d3550 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -13,7 +13,8 @@ void paged_attention_v1( torch::Tensor& context_lens, int block_size, int max_context_len, - const c10::optional& alibi_slopes); + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype); void paged_attention_v2( torch::Tensor& out, @@ -29,7 +30,8 @@ void paged_attention_v2( torch::Tensor& context_lens, int block_size, int max_context_len, - const c10::optional& alibi_slopes); + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype); void rms_norm( torch::Tensor& out, diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index f94efadfa101a..db2da8f06bcf0 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -75,6 +75,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "gather_cached_kv", &gather_cached_kv, "Gather key and value from the cache into contiguous QKV tensors"); + cache_ops.def( + "convert_fp8_e5m2", + &convert_fp8_e5m2, + "Convert the key and value cache to fp8_e5m2 data type"); // Cuda utils pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils"); diff --git a/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh b/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh new file mode 100644 index 0000000000000..c3b0d311b89cc --- /dev/null +++ b/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh @@ -0,0 +1,278 @@ +#pragma once + +#include +#include +#include +#include +#include "../../attention/attention_dtypes.h" +#include "../../attention/dtype_float32.cuh" +#include "../../attention/dtype_float16.cuh" +#include "../../attention/dtype_bfloat16.cuh" + +#pragma once + +namespace vllm { +#ifdef ENABLE_FP8_E5M2 +namespace fp8_e5m2_unscaled { + +template +__inline__ __device__ Tout vec_conversion(const Tin& x) +{ + return x; +} + +// fp8 -> half +template<> +__inline__ __device__ uint16_t vec_conversion(const uint8_t& a) +{ + __half_raw res = __nv_cvt_fp8_to_halfraw(a, __NV_E5M2); + return res.x; +} + +// fp8x2 -> half2 +template<> +__inline__ __device__ uint32_t vec_conversion(const uint16_t& a) +{ + union { + uint16_t u16[2]; + uint32_t u32; + } tmp; + __half2_raw res = __nv_cvt_fp8x2_to_halfraw2(a, __NV_E5M2); + tmp.u16[0] = res.x; + tmp.u16[1] = res.y; + return tmp.u32; +} + +// fp8x4 -> half2x2 +template<> +__inline__ __device__ uint2 vec_conversion(const uint32_t& a) +{ + union { + uint2 u32x2; + uint32_t u32[2]; + } tmp; + tmp.u32[0] = vec_conversion((uint16_t)a); + tmp.u32[1] = vec_conversion((uint16_t)(a >> 16U)); + return tmp.u32x2; +} + +// fp8x8 -> half2x4 +template<> +__inline__ __device__ uint4 vec_conversion(const uint2& a) +{ + union { + uint4 u64x2; + uint2 u64[2]; + } tmp; + tmp.u64[0] = vec_conversion(a.x); + tmp.u64[1] = vec_conversion(a.y); + return 
tmp.u64x2; +} + +// fp8 -> __nv_bfloat16 +template<> +__inline__ __device__ __nv_bfloat16 vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a) +{ + // Note there is no direct convert function from fp8 to bf16. + // fp8 -> half + __half_raw res = __nv_cvt_fp8_to_halfraw(a, __NV_E5M2); + // half -> float -> bf16 + float tmp = half_to_float(res.x); + return __float2bfloat16(tmp); +} + +// fp8x2 -> __nv_bfloat162 +template<> +__inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a) +{ + __nv_bfloat162 res; + res.x = vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a); + res.y = vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)(a >> 8U)); + return res; +} + +// fp8x4 -> bf16_4_t +template<> +__inline__ __device__ bf16_4_t vec_conversion(const uint32_t& a) +{ + bf16_4_t res; + res.x = vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a); + res.y = vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)(a >> 16U)); + return res; +} + +// fp8x8 -> bf16_8_t +template<> +__inline__ __device__ bf16_8_t vec_conversion(const uint2& a) +{ + bf16_4_t tmp1, tmp2; + tmp1 = vec_conversion(a.x); + tmp2 = vec_conversion(a.y); + bf16_8_t res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; +} + +// fp8 -> float +template<> +__inline__ __device__ float vec_conversion(const uint8_t& a) +{ + // fp8 -> half + uint16_t tmp = vec_conversion(a); + // half -> float + return half_to_float(tmp); +} + +// fp8x2 -> float2 +template<> +__inline__ __device__ float2 vec_conversion(const uint16_t& a) +{ + // fp8x2 -> half2 + uint32_t tmp = vec_conversion(a); + // half2 -> float2 + return half2_to_float2(tmp); +} + +// fp8x4 -> float4 +template<> +__inline__ __device__ Float4_ vec_conversion(const uint32_t& a) +{ + Float4_ res; + res.x = vec_conversion((uint16_t)a); + res.y = vec_conversion((uint16_t)(a >> 16U)); + return res; +} + +// fp8x8 -> float8 +template<> +__inline__ __device__ Float8_ vec_conversion(const uint2& a) +{ + Float4_ tmp1, tmp2; + tmp1 = vec_conversion(a.x); + tmp2 = vec_conversion(a.y); + Float8_ res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; +} + + +// half -> fp8 +template<> +__inline__ __device__ uint8_t vec_conversion(const uint16_t& a) +{ + __half_raw tmp; + tmp.x = a; + __nv_fp8_storage_t res = __nv_cvt_halfraw_to_fp8(tmp, __NV_SATFINITE, __NV_E5M2); + return (uint8_t)res; +} + +// bf16 -> fp8 +template<> +__inline__ __device__ uint8_t vec_conversion(const __nv_bfloat16& a) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + assert(false); +#else + __nv_fp8_storage_t res = __nv_cvt_bfloat16raw_to_fp8(__nv_bfloat16_raw(a), __NV_SATFINITE, __NV_E5M2); + return (uint8_t)res; +#endif +} + +// float -> fp8 +template<> +__inline__ __device__ uint8_t vec_conversion(const float& a) +{ + __nv_fp8_storage_t res = __nv_cvt_float_to_fp8(a, __NV_SATFINITE, __NV_E5M2); + return (uint8_t)res; +} + +// fp8x4 -> float4 +template<> +__inline__ __device__ float4 vec_conversion(const uint32_t& a) +{ + Float4_ tmp = vec_conversion(a); + float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y); + return res; +} + + +template<> +__inline__ __device__ uint32_t vec_conversion(const float2& a) +{ + union { + half2 float16; + uint32_t uint32; + }; + + float16 = __float22half2_rn(a); + return uint32; +} + +template<> +__inline__ __device__ uint2 vec_conversion(const Float4_& a) +{ + uint2 b; + float2 val; + val.x = a.x.x; + val.y = a.x.y; + b.x = vec_conversion(val); + + val.x = a.y.x; + val.y = 
a.y.y;
+    b.y = vec_conversion(val);
+
+    return b;
+}
+
+template<>
+__inline__ __device__ float4 vec_conversion(const Float4_& a)
+{
+    float4 b;
+    b.x = a.x.x;
+    b.y = a.x.y;
+    b.z = a.y.x;
+    b.w = a.y.y;
+    return b;
+}
+
+template<>
+__inline__ __device__ uint4 vec_conversion(const Float8_& a)
+{
+    uint4 b;
+    b.x = vec_conversion(a.x);
+    b.y = vec_conversion(a.y);
+    b.z = vec_conversion(a.z);
+    b.w = vec_conversion(a.w);
+    return b;
+}
+
+template<>
+__inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, float2>(const float2 &a) {
+    __nv_bfloat162 b;
+    from_float(b, a);
+    return b;
+}
+
+template<>
+__inline__ __device__ bf16_4_t vec_conversion(const Float4_ &a) {
+    bf16_4_t b;
+    from_float(b, a);
+    return b;
+}
+
+template<>
+__inline__ __device__ bf16_8_t vec_conversion(const Float8_ &a) {
+    bf16_8_t b;
+    from_float(b, a);
+    return b;
+}
+
+} // namespace fp8_e5m2_unscaled
+#endif // ENABLE_FP8_E5M2
+} // namespace vllm
diff --git a/docs/source/quantization/fp8_e5m2_kv_cache.rst b/docs/source/quantization/fp8_e5m2_kv_cache.rst
new file mode 100644
index 0000000000000..10437260ad964
--- /dev/null
+++ b/docs/source/quantization/fp8_e5m2_kv_cache.rst
@@ -0,0 +1,32 @@
+.. _fp8_e5m2_kv_cache:
+
+FP8 E5M2 KV Cache
+==================
+
+The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits.
+The FP8 data format retains 2~3 mantissa bits and supports conversion between float/fp16/bfloat16 and fp8.
+
+Here is an example of how to enable this feature:
+
+.. code-block:: python
+    from vllm import LLM, SamplingParams
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    # Create a sampling params object.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    # Create an LLM.
+    llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8_e5m2")
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+ for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + diff --git a/setup.py b/setup.py index 2f6242690a263..b552cb67cb363 100644 --- a/setup.py +++ b/setup.py @@ -253,6 +253,9 @@ def get_torch_arch_list() -> Set[str]: num_threads = min(os.cpu_count(), nvcc_threads) NVCC_FLAGS += ["--threads", str(num_threads)] + if nvcc_cuda_version >= Version("11.8"): + NVCC_FLAGS += ["-DENABLE_FP8_E5M2"] + # changes for punica kernels NVCC_FLAGS += torch_cpp_ext.COMMON_NVCC_FLAGS REMOVE_NVCC_FLAGS = [ diff --git a/tests/kernels/conftest.py b/tests/kernels/conftest.py index fca97ab76bf09..8c51bfc149efe 100644 --- a/tests/kernels/conftest.py +++ b/tests/kernels/conftest.py @@ -1,44 +1,7 @@ -from typing import List, Tuple - import pytest -import torch - - -def create_kv_caches( - num_blocks: int, - block_size: int, - num_layers: int, - num_heads: int, - head_size: int, - dtype: torch.dtype, - seed: int, - device: str, -) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - - scale = head_size**-0.5 - x = 16 // torch.tensor([], dtype=dtype).element_size() - key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) - key_caches = [] - for _ in range(num_layers): - key_cache = torch.empty(size=key_cache_shape, - dtype=dtype, - device=device) - key_cache.uniform_(-scale, scale) - key_caches.append(key_cache) - - value_cache_shape = (num_blocks, num_heads, head_size, block_size) - value_caches = [] - for _ in range(num_layers): - value_cache = torch.empty(size=value_cache_shape, - dtype=dtype, - device=device) - value_cache.uniform_(-scale, scale) - value_caches.append(value_cache) - return key_caches, value_caches +from vllm.utils import create_kv_caches_with_random @pytest.fixture() def kv_cache_factory(): - return create_kv_caches + return create_kv_caches_with_random diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 3949948e860f7..cbb1d40623c71 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -6,14 +6,16 @@ from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask -from vllm._C import ops +from vllm._C import ops, cache_ops from vllm.utils import get_max_shared_memory_bytes FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. # - 512 as a buffer MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 -NUM_BLOCKS = 12000 # Arbitrary values for testing +# There may not be enough gpu memory due to large NUM_BLOCKS. +# Reduce NUM_BLOCKS when it happens. 
+NUM_BLOCKS = 4321 # Arbitrary values for testing PARTITION_SIZE = 512 DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -23,6 +25,7 @@ HEAD_SIZES = [64, 80, 96, 112, 128, 256] BLOCK_SIZES = [16, 32] USE_ALIBI = [False, True] +KV_CACHE_DTYPE = ["auto", "fp8_e5m2"] SEEDS = [0] DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] @@ -105,6 +108,7 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize("use_alibi", USE_ALIBI) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", DEVICES) def test_paged_attention( @@ -116,6 +120,7 @@ def test_paged_attention( use_alibi: bool, block_size: int, dtype: torch.dtype, + kv_cache_dtype: str, seed: int, device: int, ) -> None: @@ -158,8 +163,9 @@ def test_paged_attention( # Create the KV caches. key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, - num_kv_heads, head_size, dtype, - seed, gpu_id) + num_kv_heads, head_size, + kv_cache_dtype, dtype, seed, + gpu_id) key_cache, value_cache = key_caches[0], value_caches[0] # Call the paged attention kernel. @@ -177,6 +183,7 @@ def test_paged_attention( block_size, max_context_len, alibi_slopes, + kv_cache_dtype, ) elif version == "v2": num_partitions = ((max_context_len + PARTITION_SIZE - 1) // @@ -209,11 +216,30 @@ def test_paged_attention( block_size, max_context_len, alibi_slopes, + kv_cache_dtype, ) else: raise AssertionError(f"Unknown version: {version}") # Run the reference implementation. + if kv_cache_dtype == "fp8_e5m2": + # Convert cache data back to dtype. + x = 16 // torch.tensor([], dtype=dtype).element_size() + key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, + block_size, x) + dequantized_key_cache = torch.empty(size=key_cache_shape, + dtype=dtype, + device=gpu_id) + cache_ops.convert_fp8_e5m2(key_cache, dequantized_key_cache) + key_cache = dequantized_key_cache + + value_cache_shape = value_cache.shape + dequantized_value_cache = torch.empty(size=value_cache_shape, + dtype=dtype, + device=gpu_id) + cache_ops.convert_fp8_e5m2(value_cache, dequantized_value_cache) + value_cache = dequantized_value_cache + ref_output = torch.empty_like(query) ref_single_query_cached_kv_attention( ref_output, @@ -230,7 +256,12 @@ def test_paged_attention( # NOTE(woosuk): Due to the kernel-level differences in the two # implementations, there is a small numerical difference in the two # outputs. Thus, we use a relaxed tolerance for the test. - assert torch.allclose(output, ref_output, atol=1e-3, rtol=1e-5) + # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, + # so we use a relaxed tolerance for the test. 
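+    # A rough bound: e5m2 keeps only 2 mantissa bits, so one quantization step
+    # can introduce a relative error of up to about 2^-3 (12.5%), which is why
+    # the absolute tolerance below is relaxed to 1e-2 for fp8_e5m2.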
+ atol, rtol = 1e-3, 1e-5 + if kv_cache_dtype == "fp8_e5m2": + atol, rtol = 1e-2, 1e-5 + assert torch.allclose(output, ref_output, atol=atol, rtol=rtol) def ref_multi_query_kv_attention( diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 7b1cc058f2cb5..193bc29bd431d 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -15,6 +15,7 @@ NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] +KV_CACHE_DTYPE = ["auto", "fp8_e5m2"] @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) @@ -26,6 +27,7 @@ @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_copy_blocks( kv_cache_factory, @@ -38,6 +40,7 @@ def test_copy_blocks( dtype: torch.dtype, seed: int, device: int, + kv_cache_dtype: str, ) -> None: random.seed(seed) torch.random.manual_seed(seed) @@ -59,7 +62,8 @@ def test_copy_blocks( # Create the KV caches. key_caches, value_caches = kv_cache_factory(num_blocks, block_size, num_layers, num_heads, - head_size, dtype, seed, gpu_id) + head_size, kv_cache_dtype, + dtype, seed, gpu_id) # Clone the KV caches. cloned_key_caches = [key_cache.clone() for key_cache in key_caches] @@ -124,7 +128,7 @@ def test_reshape_and_cache( # Create the KV caches. key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, num_heads, head_size, dtype, - seed, gpu_id) + None, seed, gpu_id) key_cache, value_cache = key_caches[0], value_caches[0] # Clone the KV caches. @@ -133,7 +137,7 @@ def test_reshape_and_cache( # Call the reshape_and_cache kernel. cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping) + slot_mapping, "auto") # Run the reference implementation. reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) diff --git a/vllm/config.py b/vllm/config.py index da97eaa77fd35..197f20c1ec9a5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,13 +1,14 @@ from typing import Optional, Union, ClassVar from dataclasses import dataclass import os +from packaging.version import Version import torch from transformers import PretrainedConfig from vllm.logger import init_logger from vllm.transformers_utils.config import get_config -from vllm.utils import get_cpu_memory, is_hip +from vllm.utils import get_cpu_memory, is_hip, get_nvcc_cuda_version logger = init_logger(__name__) @@ -275,6 +276,7 @@ class CacheConfig: gpu_memory_utilization: Fraction of GPU memory to use for the vLLM execution. swap_space: Size of the CPU swap space per GPU (in GiB). + cache_dtype: Data type for kv cache storage. """ def __init__( @@ -282,13 +284,16 @@ def __init__( block_size: int, gpu_memory_utilization: float, swap_space: int, + cache_dtype: str, sliding_window: Optional[int] = None, ) -> None: self.block_size = block_size self.gpu_memory_utilization = gpu_memory_utilization self.swap_space_bytes = swap_space * _GB + self.cache_dtype = cache_dtype self.sliding_window = sliding_window self._verify_args() + self._verify_cache_dtype() # Will be set after profiling. self.num_gpu_blocks = None @@ -300,6 +305,28 @@ def _verify_args(self) -> None: "GPU memory utilization must be less than 1.0. 
Got " f"{self.gpu_memory_utilization}.") + def _verify_cache_dtype(self) -> None: + if self.cache_dtype == "auto": + pass + elif self.cache_dtype == "fp8_e5m2": + nvcc_cuda_version = get_nvcc_cuda_version() + if nvcc_cuda_version < Version("11.8"): + raise ValueError( + "FP8 is not supported when cuda version is lower than 11.8." + ) + device_name = torch.cuda.get_device_name() + if "AMD" in device_name: + raise NotImplementedError( + "FP8_E5M2 KV Cache on AMD GPU has not been supported yet.") + logger.info( + "Using fp8_e5m2 data type to store kv cache. It reduces " + "the GPU memory footprint and boosts the performance. " + "But it may cause slight accuracy drop. " + "Currently we only support fp8 without scaling factors and " + "make e5m2 as a default format.") + else: + raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") + def verify_with_parallel_config( self, parallel_config: "ParallelConfig", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 968362c468deb..231ce3321cdc4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -17,6 +17,7 @@ class EngineArgs: download_dir: Optional[str] = None load_format: str = 'auto' dtype: str = 'auto' + kv_cache_dtype: str = 'auto' seed: int = 0 max_model_len: Optional[int] = None worker_use_ray: bool = False @@ -122,6 +123,14 @@ def add_cli_args( 'The "auto" option will use FP16 precision ' 'for FP32 and FP16 models, and BF16 precision ' 'for BF16 models.') + parser.add_argument( + '--kv-cache-dtype', + type=str, + choices=['auto', 'fp8_e5m2'], + default='auto', + help='Data type for kv cache storage. If "auto", will use model ' + 'data type. Note FP8 is not supported when cuda version is ' + 'lower than 11.8.') parser.add_argument('--max-model-len', type=int, default=None, @@ -269,7 +278,7 @@ def create_engine_configs( self.max_context_len_to_capture) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, - self.swap_space, + self.swap_space, self.kv_cache_dtype, model_config.get_sliding_window()) parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 87752eea02811..5b73ef08f9d84 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -85,6 +85,7 @@ def __init__( f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, " f"quantization={model_config.quantization}, " f"enforce_eager={model_config.enforce_eager}, " + f"kv_cache_dtype={cache_config.cache_dtype}, " f"seed={model_config.seed})") # TODO(woosuk): Print more configs in debug mode. 
@@ -144,6 +145,7 @@ def _init_workers(self): rank=0, distributed_init_method=distributed_init_method, lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=True, ) self._run_workers("init_model") @@ -234,6 +236,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", model_config = copy.deepcopy(self.model_config) parallel_config = copy.deepcopy(self.parallel_config) scheduler_config = copy.deepcopy(self.scheduler_config) + cache_config = copy.deepcopy(self.cache_config) for rank, (worker, (node_id, _)) in enumerate(zip(self.workers, @@ -249,6 +252,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", rank, distributed_init_method, lora_config=self.lora_config, + cache_config=cache_config, )) driver_rank = 0 @@ -261,6 +265,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", driver_rank, distributed_init_method, lora_config=self.lora_config, + cache_config=cache_config, is_driver_worker=True, ) @@ -306,6 +311,7 @@ def _init_cache(self) -> None: block_size=self.cache_config.block_size, gpu_memory_utilization=self.cache_config.gpu_memory_utilization, cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, ) # Since we use a shared centralized controller, we take the minimum diff --git a/vllm/model_executor/input_metadata.py b/vllm/model_executor/input_metadata.py index ef49cc5902ea6..f0a88ac8e27f8 100644 --- a/vllm/model_executor/input_metadata.py +++ b/vllm/model_executor/input_metadata.py @@ -12,6 +12,7 @@ class InputMetadata: max_context_len: The maximum context length. context_lens: the length of attention context for each sequence. block_tables: The block tables. (Seq id -> list of physical block) + kv_cache_dtype: Data type to store kv cache. """ def __init__( @@ -25,6 +26,7 @@ def __init__( context_lens: Optional[torch.Tensor], block_tables: Optional[torch.Tensor], use_cuda_graph: bool, + kv_cache_dtype: str, ) -> None: self.is_prompt = is_prompt self.prompt_lens = prompt_lens @@ -35,6 +37,7 @@ def __init__( self.context_lens = context_lens self.block_tables = block_tables self.use_cuda_graph = use_cuda_graph + self.kv_cache_dtype = kv_cache_dtype # Set during the execution of the first attention op. # FIXME(woosuk): This is a hack. @@ -47,4 +50,5 @@ def __repr__(self) -> str: f"slot_mapping={self.slot_mapping}, " f"context_lens={self.context_lens}, " f"block_tables={self.block_tables}, " - f"use_cuda_graph={self.use_cuda_graph})") + f"use_cuda_graph={self.use_cuda_graph}, " + f"kv_cache_dtype={self.kv_cache_dtype})") diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 8b5c6ab30d7b7..91ed43f07c76e 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -98,6 +98,7 @@ def forward( key_cache, value_cache, input_metadata.slot_mapping.flatten(), + input_metadata.kv_cache_dtype, ) if input_metadata.is_prompt: @@ -265,6 +266,7 @@ def _paged_attention( block_size, input_metadata.max_context_len, alibi_slopes, + input_metadata.kv_cache_dtype, ) else: # Run PagedAttention V2. 
@@ -295,5 +297,6 @@ def _paged_attention( block_size, input_metadata.max_context_len, alibi_slopes, + input_metadata.kv_cache_dtype, ) return output diff --git a/vllm/utils.py b/vllm/utils.py index 6a9508f6d33b4..dc81741498356 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1,9 +1,11 @@ import enum import os import socket +import subprocess import uuid from platform import uname -from typing import List +from typing import List, Tuple, Union +from packaging.version import parse, Version import psutil import torch @@ -17,7 +19,17 @@ from collections import OrderedDict from typing import Any, Hashable, Optional +from vllm.logger import init_logger + T = TypeVar("T") +logger = init_logger(__name__) + +STR_DTYPE_TO_TORCH_DTYPE = { + "half": torch.half, + "bfloat16": torch.bfloat16, + "float": torch.float, + "fp8_e5m2": torch.uint8, +} class Device(enum.Enum): @@ -167,3 +179,99 @@ def get_open_port() -> int: def set_cuda_visible_devices(device_ids: List[int]) -> None: os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, device_ids)) + + +def get_nvcc_cuda_version() -> Version: + cuda_home = os.environ.get('CUDA_HOME') + if not cuda_home: + cuda_home = '/usr/local/cuda' + logger.info( + f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.' + ) + nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], + universal_newlines=True) + output = nvcc_output.split() + release_idx = output.index("release") + 1 + nvcc_cuda_version = parse(output[release_idx].split(",")[0]) + return nvcc_cuda_version + + +def _generate_random_fp8_e5m2( + tensor: torch.tensor, + low: float, + high: float, +) -> None: + # NOTE(zhaoyang): Due to NaN and Inf representation for fp8 data type, + # it may occur Inf or NaN if we directly use torch.randint + # to generate random data for fp8 data. + # For example, s.11111.00 in fp8e5m2 format repesents Inf. 
+ # | E4M3 | E5M2 + #-----|-------------|------------------- + # Inf | N/A | s.11111.00 + # NaN | s.1111.111 | s.11111.{01,10,11} + from vllm._C import cache_ops + tensor_tmp = torch.empty_like(tensor, dtype=torch.float16) + tensor_tmp.uniform_(low, high) + cache_ops.convert_fp8_e5m2(tensor_tmp, tensor) + del tensor_tmp + + +def create_kv_caches_with_random( + num_blocks: int, + block_size: int, + num_layers: int, + num_heads: int, + head_size: int, + cache_dtype: Optional[Union[str, torch.dtype]], + model_dtype: Optional[Union[str, torch.dtype]] = None, + seed: Optional[int] = 0, + device: Optional[str] = "cuda", +) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + torch.random.manual_seed(seed) + torch.cuda.manual_seed(seed) + + if isinstance(cache_dtype, str): + if cache_dtype == "auto": + if isinstance(model_dtype, str): + torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype] + elif isinstance(model_dtype, torch.dtype): + torch_dtype = model_dtype + else: + raise ValueError(f"Invalid model dtype: {model_dtype}") + elif cache_dtype in ["half", "bfloat16", "float"]: + torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] + elif cache_dtype == "fp8_e5m2": + torch_dtype = torch.uint8 + else: + raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") + elif isinstance(cache_dtype, torch.dtype): + torch_dtype = cache_dtype + else: + raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") + + scale = head_size**-0.5 + x = 16 // torch.tensor([], dtype=torch_dtype).element_size() + key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) + key_caches = [] + for _ in range(num_layers): + key_cache = torch.empty(size=key_cache_shape, + dtype=torch_dtype, + device=device) + if cache_dtype in ["auto", "half", "bfloat16", "float"]: + key_cache.uniform_(-scale, scale) + elif cache_dtype == 'fp8_e5m2': + _generate_random_fp8_e5m2(key_cache, -scale, scale) + key_caches.append(key_cache) + + value_cache_shape = (num_blocks, num_heads, head_size, block_size) + value_caches = [] + for _ in range(num_layers): + value_cache = torch.empty(size=value_cache_shape, + dtype=torch_dtype, + device=device) + if cache_dtype in ["auto", "half", "bfloat16", "float"]: + value_cache.uniform_(-scale, scale) + elif cache_dtype == 'fp8_e5m2': + _generate_random_fp8_e5m2(value_cache, -scale, scale) + value_caches.append(value_cache) + return key_caches, value_caches diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 1dd0243f8f3a3..f57e1ed75803d 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,7 +6,7 @@ from vllm._C import cache_ops from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import in_wsl +from vllm.utils import in_wsl, STR_DTYPE_TO_TORCH_DTYPE logger = init_logger(__name__) @@ -34,12 +34,16 @@ def __init__( self.head_size = model_config.get_head_size() self.num_layers = model_config.get_num_layers(parallel_config) self.num_heads = model_config.get_num_kv_heads(parallel_config) - self.dtype = model_config.dtype self.block_size = cache_config.block_size self.num_gpu_blocks = cache_config.num_gpu_blocks self.num_cpu_blocks = cache_config.num_cpu_blocks + if cache_config.cache_dtype == "auto": + self.dtype = model_config.dtype + else: + self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] + # Initialize the cache. 
self.gpu_cache = self.allocate_gpu_cache() self.cpu_cache = self.allocate_cpu_cache() @@ -142,6 +146,7 @@ def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: @staticmethod def get_cache_block_size( block_size: int, + cache_dtype: str, model_config: ModelConfig, parallel_config: ParallelConfig, ) -> int: @@ -152,7 +157,11 @@ def get_cache_block_size( key_cache_block = block_size * num_heads * head_size value_cache_block = key_cache_block total = num_layers * (key_cache_block + value_cache_block) - dtype_size = _get_dtype_size(model_config.dtype) + if cache_dtype == "auto": + dtype = model_config.dtype + else: + dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] + dtype_size = _get_dtype_size(dtype) return dtype_size * total diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 60f5b71d35615..2a12152a70863 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -36,6 +36,7 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, lora_config: Optional[LoRAConfig], + kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, ): self.model_config = model_config @@ -68,6 +69,7 @@ def __init__( self.graph_block_tables = None # Set after initial profiling. # cache in_wsl result self.in_wsl = in_wsl() + self.kv_cache_dtype = kv_cache_dtype def load_model(self) -> None: self.model = get_model(self.model_config, self.lora_config) @@ -223,6 +225,7 @@ def _prepare_prompt( context_lens=context_lens_tensor, block_tables=block_tables, use_cuda_graph=False, + kv_cache_dtype=self.kv_cache_dtype, ) return (input_tokens, input_positions, input_metadata, prompt_lens, subquery_lens, lora_index_mapping, lora_prompt_mapping, @@ -350,6 +353,7 @@ def _prepare_decode( context_lens=context_lens, block_tables=block_tables, use_cuda_graph=use_captured_graph, + kv_cache_dtype=self.kv_cache_dtype, ) return input_tokens, input_positions, input_metadata, lora_index_mapping, lora_prompt_mapping, lora_requests @@ -473,6 +477,7 @@ def prepare_input_tensors( "context_lens": input_metadata.context_lens, "block_tables": input_metadata.block_tables, "use_cuda_graph": input_metadata.use_cuda_graph, + "kv_cache_dtype": input_metadata.kv_cache_dtype, "selected_token_indices": sampling_metadata.selected_token_indices, "lora_requests": lora_requests, @@ -495,6 +500,7 @@ def prepare_input_tensors( context_lens=metadata_dict["context_lens"], block_tables=metadata_dict["block_tables"], use_cuda_graph=metadata_dict["use_cuda_graph"], + kv_cache_dtype=metadata_dict["kv_cache_dtype"], ) sampling_metadata = SamplingMetadata( seq_groups=None, @@ -665,6 +671,7 @@ def capture_model(self, kv_caches: List[KVCache]) -> None: context_lens=context_lens[:batch_size], block_tables=block_tables[:batch_size], use_cuda_graph=True, + kv_cache_dtype=self.kv_cache_dtype, ) if self.lora_config: diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index f1dad64b2b27a..a74adfa585611 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -37,6 +37,7 @@ def __init__( rank: int, distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, + kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, ) -> None: self.model_config = model_config @@ -54,6 +55,7 @@ def __init__( parallel_config, scheduler_config, lora_config=self.lora_config, + kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # self.init_cache_engine(). 
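To make the effect of cache_dtype on get_cache_block_size concrete, a back-of-the-envelope sketch (the 7B-style shapes are illustrative, not taken from this patch):

    # Per-block KV cache size, following the formula in get_cache_block_size() above.
    block_size, num_heads, head_size, num_layers = 16, 32, 128, 32

    key_cache_block = block_size * num_heads * head_size               # 65,536 elements
    value_cache_block = key_cache_block
    total_elems = num_layers * (key_cache_block + value_cache_block)   # 4,194,304 elements

    print(2 * total_elems)  # 8,388,608 bytes (8 MiB) per block when "auto" resolves to fp16
    print(1 * total_elems)  # 4,194,304 bytes (4 MiB) per block with cache_dtype="fp8_e5m2"

In other words, the fp8_e5m2 cache halves the per-block footprint, so roughly twice as many KV cache blocks fit in the same GPU memory budget.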
@@ -95,6 +97,7 @@ def profile_num_available_blocks( block_size: int, gpu_memory_utilization: float, cpu_swap_space: int, + cache_dtype: str, ) -> Tuple[int, int]: """Profiles the peak memory usage of the model and returns the maximum number of GPU and CPU cache blocks that can be allocated. @@ -119,7 +122,7 @@ def profile_num_available_blocks( peak_memory = total_gpu_memory - free_gpu_memory cache_block_size = CacheEngine.get_cache_block_size( - block_size, self.model_config, self.parallel_config) + block_size, cache_dtype, self.model_config, self.parallel_config) num_gpu_blocks = int( (total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size) From b72af8f1eded6f5838be29eb6093ab0e0e0c240c Mon Sep 17 00:00:00 2001 From: zhaoyang-star Date: Mon, 29 Jan 2024 14:47:39 +0800 Subject: [PATCH 012/112] Fix error when tp > 1 (#2644) Co-authored-by: zhaoyang-star --- vllm/engine/llm_engine.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5b73ef08f9d84..0d836a1fb13a9 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -236,7 +236,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", model_config = copy.deepcopy(self.model_config) parallel_config = copy.deepcopy(self.parallel_config) scheduler_config = copy.deepcopy(self.scheduler_config) - cache_config = copy.deepcopy(self.cache_config) for rank, (worker, (node_id, _)) in enumerate(zip(self.workers, @@ -252,7 +251,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", rank, distributed_init_method, lora_config=self.lora_config, - cache_config=cache_config, + kv_cache_dtype=self.cache_config.cache_dtype, )) driver_rank = 0 @@ -265,7 +264,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", driver_rank, distributed_init_method, lora_config=self.lora_config, - cache_config=cache_config, + kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=True, ) From 1b20639a43e811f4469e3cfa543cf280d0d76265 Mon Sep 17 00:00:00 2001 From: Hanzhi Zhou Date: Tue, 30 Jan 2024 02:46:29 +0800 Subject: [PATCH 013/112] No repeated IPC open (#2642) --- csrc/custom_all_reduce.cuh | 43 ++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 6e71bb9a9c6e8..54409e19eb455 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -327,6 +328,10 @@ __global__ void __launch_bounds__(512, 1) } } +using IPC_KEY = std::array; +static_assert(sizeof(IPC_KEY) == sizeof(cudaIpcMemHandle_t)); +static_assert(alignof(IPC_KEY) == alignof(cudaIpcMemHandle_t)); + class CustomAllreduce { public: int rank_; @@ -341,7 +346,8 @@ class CustomAllreduce { // stores the registered device pointers from all ranks RankData *d_rank_data_base_, *d_rank_data_end_; std::vector graph_unreg_buffers_; - std::vector ipc_handles_; + // a map from IPC handles to opened IPC pointers + std::map ipc_handles_; /** * meta is a pointer to device metadata and temporary buffer for allreduce. 
@@ -365,10 +371,7 @@ class CustomAllreduce { for (int i = 0; i < world_size_; i++) { Metadata *rank_meta; if (i != rank_) { - char *handle; - CUDACHECK(cudaIpcOpenMemHandle((void **)&handle, handles[i], - cudaIpcMemLazyEnablePeerAccess)); - ipc_handles_.push_back(handle); + char *handle = open_ipc_handle(&handles[i]); handle += offsets[i]; rank_meta = (Metadata *)handle; } else { @@ -378,6 +381,19 @@ class CustomAllreduce { } } + char *open_ipc_handle(const void *ipc_handle) { + auto [it, new_handle] = + ipc_handles_.insert({*((IPC_KEY *)ipc_handle), nullptr}); + if (new_handle) { + char *ipc_ptr; + CUDACHECK(cudaIpcOpenMemHandle((void **)&ipc_ptr, + *((const cudaIpcMemHandle_t *)ipc_handle), + cudaIpcMemLazyEnablePeerAccess)); + it->second = ipc_ptr; + } + return it->second; + } + std::pair, std::vector> get_graph_buffer_ipc_meta() { auto num_buffers = graph_unreg_buffers_.size(); @@ -413,11 +429,7 @@ class CustomAllreduce { RankData data; for (int i = 0; i < world_size_; i++) { if (i != rank_) { - char *handle; - CUDACHECK(cudaIpcOpenMemHandle( - (void **)&handle, *((const cudaIpcMemHandle_t *)handles[i].data()), - cudaIpcMemLazyEnablePeerAccess)); - ipc_handles_.push_back(handle); + char *handle = open_ipc_handle(handles[i].data()); handle += offsets[i]; data.ptrs[i] = handle; } else { @@ -448,13 +460,8 @@ class CustomAllreduce { auto &rd = rank_data[i]; for (int j = 0; j < world_size_; j++) { if (j != rank_) { - char *handle; - CUDACHECK(cudaIpcOpenMemHandle( - (void **)&handle, - *((cudaIpcMemHandle_t *)&handles[j] - [i * sizeof(cudaIpcMemHandle_t)]), - cudaIpcMemLazyEnablePeerAccess)); - ipc_handles_.push_back(handle); + char *handle = + open_ipc_handle(&handles[j][i * sizeof(cudaIpcMemHandle_t)]); handle += offsets[j][i]; rd.ptrs[j] = handle; } else { @@ -541,7 +548,7 @@ class CustomAllreduce { } ~CustomAllreduce() { - for (auto ptr : ipc_handles_) { + for (auto [_, ptr] : ipc_handles_) { CUDACHECK(cudaIpcCloseMemHandle(ptr)); } } From ea8489fce266d69f2fbe314c1385956b1a342e12 Mon Sep 17 00:00:00 2001 From: Rasmus Larsen Date: Mon, 29 Jan 2024 19:52:31 +0100 Subject: [PATCH 014/112] ROCm: Allow setting compilation target (#2581) --- setup.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index b552cb67cb363..8fad433f98b09 100644 --- a/setup.py +++ b/setup.py @@ -287,11 +287,15 @@ def get_torch_arch_list() -> Set[str]: }, )) elif _is_hip(): - amd_arch = get_amdgpu_offload_arch() - if amd_arch not in ROCM_SUPPORTED_ARCHS: - raise RuntimeError( - f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" - f"amdgpu_arch_found: {amd_arch}") + amd_archs = os.getenv("GPU_ARCHS") + if amd_archs is None: + amd_archs = get_amdgpu_offload_arch() + for arch in amd_archs.split(";"): + if arch not in ROCM_SUPPORTED_ARCHS: + raise RuntimeError( + f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" + f"amdgpu_arch_found: {arch}") + NVCC_FLAGS += [f"--offload-arch={arch}"] elif _is_neuron(): neuronxcc_version = get_neuronxcc_version() From 5d60def02cb5a43fa5864fcb123909b101df9ec5 Mon Sep 17 00:00:00 2001 From: wangding zeng <155410488+zwd003@users.noreply.github.com> Date: Tue, 30 Jan 2024 13:19:48 +0800 Subject: [PATCH 015/112] DeepseekMoE support with Fused MoE kernel (#2453) Co-authored-by: roy --- csrc/dispatch_utils.h | 11 + csrc/moe_align_block_size_kernels.cu | 108 ++++++ csrc/ops.h | 9 + csrc/pybind.cpp | 4 + setup.py | 1 + tests/kernels/test_fused_moe.py | 50 +++ vllm/model_executor/layers/fused_moe.py | 287 
+++++++++++++++ vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/deepseek.py | 453 ++++++++++++++++++++++++ 9 files changed, 924 insertions(+) create mode 100644 csrc/moe_align_block_size_kernels.cu create mode 100644 tests/kernels/test_fused_moe.py create mode 100644 vllm/model_executor/layers/fused_moe.py create mode 100644 vllm/model_executor/models/deepseek.py diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index 85fdfc091e94c..91abd9e85b4bb 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -24,3 +24,14 @@ #define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH( \ TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__)) + +#define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) + +#define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) diff --git a/csrc/moe_align_block_size_kernels.cu b/csrc/moe_align_block_size_kernels.cu new file mode 100644 index 0000000000000..81cc6dd6349d0 --- /dev/null +++ b/csrc/moe_align_block_size_kernels.cu @@ -0,0 +1,108 @@ +#include +#include + +#include +#include + +#include "cuda_compat.h" +#include "dispatch_utils.h" + +const static size_t NUM_MAX_EXPERTS = 64; +#define CEILDIV(x,y) (((x) + (y) - 1) / (y)) + +namespace vllm { +template +__global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, + int32_t *sorted_token_ids, + int32_t *expert_ids, + int32_t *total_tokens_post_pad, + int32_t num_experts, + int32_t block_size, + size_t numel) { + const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); + const size_t start_idx = threadIdx.x * tokens_per_thread; + __shared__ int32_t tokens_cnts[NUM_MAX_EXPERTS + 1][NUM_MAX_EXPERTS]; + __shared__ int32_t cumsum[NUM_MAX_EXPERTS + 1]; + for (int i = 0; i < num_experts; ++i) { + tokens_cnts[threadIdx.x + 1][i] = 0; + } + + /** + * In the first step we compute token_cnts[thread_index + 1][expert_index], + * which counts how many tokens in the token shard of thread_index are assigned + * to expert expert_index. + */ + for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { + ++tokens_cnts[threadIdx.x + 1][topk_ids[i]]; + } + + __syncthreads(); + + // For each expert we accumulate the token counts from the different threads. + tokens_cnts[0][threadIdx.x] = 0; + for (int i = 1; i <= blockDim.x; ++i) { + tokens_cnts[i][threadIdx.x] += tokens_cnts[i-1][threadIdx.x]; + } + + __syncthreads(); + + // We accumulate the token counts of all experts in thread 0. + if (threadIdx.x == 0) { + cumsum[0] = 0; + for (int i = 1; i <= num_experts; ++i) { + cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[blockDim.x][i - 1], block_size) * block_size; + } + *total_tokens_post_pad = cumsum[num_experts]; + } + + __syncthreads(); + + /** + * For each expert, each thread processes the tokens of the corresponding blocks + * and stores the corresponding expert_id for each block. + */ + for (int i = cumsum[threadIdx.x];i < cumsum[threadIdx.x + 1];i += block_size) { + expert_ids[i / block_size] = threadIdx.x; + } + + /** + * Each thread processes a token shard, calculating the index of each token after + * sorting by expert number. 
Given the example topk_ids = [0,1,2,1,2,3,0,3,4] and + * block_size = 4, then the output would be [0, 6, *, *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], + * where * represents a padding value(preset in python). + */ + for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { + int32_t expert_id = topk_ids[i]; + /** The cumsum[expert_id] stores the starting index of the tokens that the + * expert with expert_id needs to process, and tokens_cnts[threadIdx.x][expert_id] + * stores the indices of the tokens processed by the expert with expert_id within + * the current thread's token shard. + */ + int32_t rank_post_pad = tokens_cnts[threadIdx.x][expert_id] + cumsum[expert_id]; + sorted_token_ids[rank_post_pad] = i; + ++tokens_cnts[threadIdx.x][expert_id]; + } +} +} + +void moe_align_block_size( + torch::Tensor topk_ids, + int num_experts, + int block_size, + torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad) { + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + assert(num_experts <= NUM_MAX_EXPERTS); + VLLM_DISPATCH_INTEGRAL_TYPES( + topk_ids.scalar_type(), "moe_alig_block_size_kernel", [&] { + vllm::moe_align_block_size_kernel<<<1, num_experts, 0, stream>>>( + topk_ids.data_ptr(), + sorted_token_ids.data_ptr(), + experts_ids.data_ptr(), + num_tokens_post_pad.data_ptr(), + num_experts, + block_size, + topk_ids.numel()); + }); +} diff --git a/csrc/ops.h b/csrc/ops.h index ce77dd47d3550..6e52dd81bc517 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -121,3 +121,12 @@ std::pair, std::vector> get_graph_buffer_ipc_meta( void register_graph_buffers(fptr_t _fa, const std::vector &handles, const std::vector> &offsets); #endif + +void moe_align_block_size( + torch::Tensor topk_ids, + int num_experts, + int block_size, + torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad + ); diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index db2da8f06bcf0..a8a998830e868 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -56,6 +56,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ"); ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ"); ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); + ops.def( + "moe_align_block_size", + &moe_align_block_size, + "Aligning the number of tokens to be processed by each expert such that it is divisible by the block size."); // Cache ops pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); diff --git a/setup.py b/setup.py index 8fad433f98b09..3e2127855a755 100644 --- a/setup.py +++ b/setup.py @@ -309,6 +309,7 @@ def get_torch_arch_list() -> Set[str]: "csrc/quantization/squeezellm/quant_cuda_kernel.cu", "csrc/quantization/gptq/q_gemm.cu", "csrc/cuda_utils_kernels.cu", + "csrc/moe_align_block_size_kernels.cu", "csrc/pybind.cpp", ] diff --git a/tests/kernels/test_fused_moe.py b/tests/kernels/test_fused_moe.py new file mode 100644 index 0000000000000..80a0349d6575b --- /dev/null +++ b/tests/kernels/test_fused_moe.py @@ -0,0 +1,50 @@ +import pytest +import torch + +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.activation import SiluAndMul + + +def torch_moe(a, w1, w2, topk_weight, topk_ids): + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk_ids.shape[1], 1).reshape(-1, D) + out = torch.zeros(B * topk_ids.shape[1], + w2.shape[1], + dtype=a.dtype, + device=a.device) + topk_ids = 
topk_ids.view(-1) + topk_weight = topk_weight.view(-1) + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + out[mask] = SiluAndMul()( + a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1) + return (out.view(B, -1, w2.shape[1]) * + topk_weight.view(B, -1, 1)).sum(dim=1) + + +@pytest.mark.parametrize("m", [512, 222, 33, 1]) +@pytest.mark.parametrize("n", [2048, 256, 1024]) +@pytest.mark.parametrize("k", [128, 511, 1024]) +@pytest.mark.parametrize("e", [8, 64]) +@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +def test_fused_moe( + m: int, + n: int, + k: int, + e: int, + topk: int, + dtype: torch.dtype, +): + a = torch.randn((m, k), device='cuda', dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10 + + score = torch.randn((m, e), device='cuda', dtype=dtype) + score = torch.softmax(score, dim=-1) + topk_weight, topk_ids = torch.topk(score, topk) + + triton_output = fused_moe(a, w1, w2, topk_weight, topk_ids, False) + torch_output = torch_moe(a, w1, w2, topk_weight, topk_ids) + assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0) diff --git a/vllm/model_executor/layers/fused_moe.py b/vllm/model_executor/layers/fused_moe.py new file mode 100644 index 0000000000000..998062d82d1f0 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe.py @@ -0,0 +1,287 @@ +"""Fused MoE kernel.""" +import torch +import triton +import triton.language as tl + +from vllm._C import ops + + +@triton.jit +def fused_moe_kernel( + # Pointers to matrices + a_ptr, + b_ptr, + c_ptr, + topk_weights_ptr, + sorted_token_ids_ptr, + expert_ids_ptr, + num_tokens_post_padded_ptr, + # Matrix dimensions + N, + K, + EM, + num_valid_tokens, + # The stride variables represent how much to increase the ptr by when moving by 1 + # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr` + # by to get the element one row down (A has M rows). + stride_am, + stride_ak, + stride_be, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + MUL_ROUTED_WEIGHT: tl.constexpr, + top_k: tl.constexpr, + compute_type: tl.constexpr, +): + """ + Implements the fused computation for a Mixture of Experts (MOE) using token and expert matrices. + + Key Parameters: + - A: The input tensor representing tokens with shape (*, K), where '*' can be any shape representing batches and K is the feature dimension of each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is the number of experts, K is the input feature dimension, and N is the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the total number of tokens post padding, topk is the number of times each token is repeated, + and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, repeated topk times and arranged by the expert index they are assigned to. + - expert_ids: A tensor containing the indices of the expert for each block. It determines which expert matrix from B should be used for each block in A. + This kernel performs the multiplication of a token by its corresponding expert matrix as determined by `expert_ids`. 
The sorting of `sorted_token_ids` + by expert index and padding ensures divisibility by BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix multiplication across different blocks processed by the same expert. + """ + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # ---------------------------------------------------------- + # Create pointers for the first blocks of A and B. + # We will advance this pointer as we move in the K direction + # and accumulate + # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers + # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) + if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: + return + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + token_mask = offs_token < num_valid_tokens + + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + + offs_k[None, :] * stride_ak) + + off_experts = tl.load(expert_ids_ptr + pid_m) + b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + + offs_bn[None, :] * stride_bn) + + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the K dimension. + a = tl.load(a_ptrs, + mask=token_mask[:, None] & + (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0) + b = tl.load(b_ptrs, + mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, + other=0.0) + # We accumulate along the K dimension. + accumulator += tl.dot(a, b) + # Advance the ptrs to the next K block. + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + + if MUL_ROUTED_WEIGHT: + moe_weight = tl.load(topk_weights_ptr + offs_token, + mask=token_mask, + other=0) + accumulator = accumulator * moe_weight[:, None] + + accumulator = accumulator.to(compute_type) + # ----------------------------------------------------------- + # Write back the block of the output + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[ + None, :] + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +def moe_align_block_size( + topk_ids: torch.Tensor, block_size: int, + num_experts: int) -> (torch.Tensor, torch.Tensor, torch.Tensor): + """ + Aligns the token distribution across experts to be compatible with block size for matrix multiplication. 
+ + Parameters: + - topk_ids: A tensor of shape [total_tokens, top_k] representing the top-k expert indices for each token. + - block_size: The block size used in block matrix multiplication. + - num_experts: The total number of experts. + + Returns: + - sorted_token_ids: A tensor containing the sorted token indices according to their allocated expert. + - expert_ids: A tensor indicating the assigned expert index for each block. + - num_tokens_post_padded: The total number of tokens after padding, ensuring divisibility by block_size. + + This function pads the number of tokens that each expert needs to process so that it is divisible by block_size. + Padding ensures that during block matrix multiplication, the dimensions align correctly. + + Example: + Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], block_size = 4, and num_experts = 4: + - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts, with each expert needing to process 3 tokens. + - As block_size is 4, we pad 1 token for each expert. + - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3]. + - Then append padding tokens [12, 12, 12, 12] for each block. + - After sorting by expert index, we obtain token_ids [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]. + Tokens 12 are non-existent (padding) and are ignored in the subsequent matrix multiplication. + - The padding ensures that the total number of tokens is now divisible by block_size for proper block matrix operations. + """ + sorted_ids = torch.empty( + (topk_ids.numel() + num_experts * (block_size - 1), ), + dtype=torch.int32, + device=topk_ids.device) + expert_ids = torch.empty((topk_ids.numel() + num_experts, ), + dtype=torch.int32, + device=topk_ids.device) + sorted_ids.fill_(topk_ids.numel()) + num_tokens_post_pad = torch.empty((1), + dtype=torch.int32, + device=topk_ids.device) + ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids, + expert_ids, num_tokens_post_pad) + return sorted_ids, expert_ids, num_tokens_post_pad + + +def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_padded: torch.Tensor, + mul_routed_weight: bool, top_k: int, config: dict): + + assert topk_weights.stride(1) == 1 + assert sorted_token_ids.stride(0) == 1 + + grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ + 'BLOCK_SIZE_M']) * triton.cdiv(B.shape[1], META['BLOCK_SIZE_N']), ) + + fused_moe_kernel[grid]( + A, + B, + C, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + B.shape[1], + B.shape[2], + sorted_token_ids.shape[0], + topk_ids.numel(), + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(1), + C.stride(2), + MUL_ROUTED_WEIGHT=mul_routed_weight, + top_k=top_k, + compute_type=tl.bfloat16 if A.dtype == torch.bfloat16 else tl.float16, + **config, + ) + + +def fused_moe(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace=False): + """ + This function computes a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2, and top-k gating mechanism. + + Parameters: + - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - w1 (torch.Tensor): The first set of expert weights. + - w2 (torch.Tensor): The second set of expert weights. 
+ - topk_weights (torch.Tensor): The weights for the top-k selected experts. + - topk_ids (torch.Tensor): The indices of the top-k selected experts. + - inplace (bool): If True, perform the operation in-place. Defaults to False. + + Returns: + - torch.Tensor: The output tensor after applying the MoE layer. + """ + # Check constraints. + assert hidden_states.shape[1] == w1.shape[2], "Incompatible dimensions" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.is_contiguous(), "Expert weights1 must be contiguous" + assert w2.is_contiguous(), "Expert weights2 must be contiguous" + assert hidden_states.dtype in [torch.float16, torch.bfloat16] + M, _ = hidden_states.shape + E, N, _ = w1.shape + + config = { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32, + 'GROUP_SIZE_M': 8 + } + + if topk_ids.numel() <= w1.shape[0]: + config = { + 'BLOCK_SIZE_M': 16, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 64, + 'GROUP_SIZE_M': 1 + } + + intermediate_cache1 = torch.empty((M, topk_ids.shape[1], N), + device=hidden_states.device, + dtype=hidden_states.dtype) + intermediate_cache2 = torch.empty((M * topk_ids.shape[1], N // 2), + device=hidden_states.device, + dtype=hidden_states.dtype) + intermediate_cache3 = torch.empty((M, topk_ids.shape[1], w2.shape[1]), + device=hidden_states.device, + dtype=hidden_states.dtype) + + sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( + topk_ids, config['BLOCK_SIZE_M'], E) + + invoke_fused_moe_kernel(hidden_states, w1, intermediate_cache1, + topk_weights, topk_ids, sorted_token_ids, + expert_ids, num_tokens_post_padded, False, + topk_ids.shape[1], config) + + ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + + invoke_fused_moe_kernel(intermediate_cache2, w2, intermediate_cache3, + topk_weights, topk_ids, sorted_token_ids, + expert_ids, num_tokens_post_padded, True, 1, + config) + + if inplace: + return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), + dim=1, + out=hidden_states) + return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), + dim=1) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index b1d74b5169ba0..93631d260abcb 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -18,6 +18,7 @@ "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), + "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py new file mode 100644 index 0000000000000..fc727b8e661b3 --- /dev/null +++ b/vllm/model_executor/models/deepseek.py @@ -0,0 +1,453 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Deepseek model.""" +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn +import torch.nn.functional as F +from transformers import PretrainedConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + ReplicatedLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.communication_op import ( + tensor_model_parallel_all_reduce) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class DeepseekMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + reduce_results: bool = True, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method, + reduce_results=reduce_results) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class DeepseekMoE(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + self.n_routed_experts = config.n_routed_experts + self.top_k = config.num_experts_per_tok + if self.tp_size > self.n_routed_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {self.n_routed_experts}.") + + self.experts = nn.ModuleList([ + DeepseekMLP(hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + reduce_results=False) + for idx in range(self.n_routed_experts) + ]) + self.pack_params() + + self.gate = ReplicatedLinear(config.hidden_size, + self.n_routed_experts, + bias=False, + linear_method=None) + + if config.n_shared_experts is not None: + intermediate_size = config.moe_intermediate_size * config.n_shared_experts + self.shared_experts = DeepseekMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + reduce_results=False, + ) + + def pack_params(self): + w1 = [] + w2 = [] + for expert in self.experts: + w1.append(expert.gate_up_proj.weight) + w2.append(expert.down_proj.weight) + self.w1 = torch._utils._flatten_dense_tensors(w1) + w1s = torch._utils._unflatten_dense_tensors(self.w1, w1) + for data, param in zip(w1s, w1): + param.data = data + self.w1 = self.w1.view(len(w1), *w1s[0].shape) + + self.w2 = torch._utils._flatten_dense_tensors(w2) + w2s = torch._utils._unflatten_dense_tensors(self.w2, w2) + for data, param in zip(w2s, w2): + param.data = data + + self.w2 = self.w2.view(len(w2), *w2s[0].shape) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + if self.config.n_shared_experts is not None: + shared_output = self.shared_experts(hidden_states) + # router_logits: (batch * sequence_length, n_experts) + router_logits, _ = self.gate(hidden_states) + + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, + self.top_k, + dim=-1) + + if self.config.norm_topk_prob: + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + final_hidden_states = fused_moe(hidden_states, + self.w1, + self.w2, + routing_weights, + selected_experts, + inplace=True) + + if self.config.n_shared_experts is not None: + final_hidden_states = final_hidden_states + shared_output + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) + + return final_hidden_states.view(batch_size, sequence_length, + hidden_dim) + + +class DeepseekAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = 
num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class DeepseekDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + layer_idx: int, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = DeepseekAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + ) + if (config.n_routed_experts is not None and \ + layer_idx >= config.first_k_dense_replace and layer_idx % config.moe_layer_freq == 0): + self.mlp = DeepseekMoE(config=config, linear_method=linear_method) + else: + self.mlp = DeepseekMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, 
residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class DeepseekModel(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + DeepseekDecoderLayer(config, + layer_idx, + linear_method=linear_method) + for layer_idx in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, + kv_caches[i], input_metadata, + residual) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class DeepseekForCausalLM(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = DeepseekModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, + cache_dir, + load_format, + revision, + fall_back_to_pt=False): + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip experts that are not assigned to this worker. + if (("mlp.experts." in name or "mlp.shared_experts." 
in name) + and name not in params_dict): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip experts that are not assigned to this worker. + if (("mlp.experts." in name or "mlp.shared_experts." in name) + and name not in params_dict): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From ab406446691f289ef51d1abd8d1ff66760eda36f Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Mon, 29 Jan 2024 22:43:37 -0800 Subject: [PATCH 016/112] Fused MOE for Mixtral (#2542) Co-authored-by: chen shen --- csrc/moe_align_block_size_kernels.cu | 2 +- csrc/ops.h | 16 +-- csrc/pybind.cpp | 6 +- vllm/model_executor/models/mixtral.py | 200 +++++++++++++------------- 4 files changed, 115 insertions(+), 109 deletions(-) diff --git a/csrc/moe_align_block_size_kernels.cu b/csrc/moe_align_block_size_kernels.cu index 81cc6dd6349d0..de6a0ec0a972c 100644 --- a/csrc/moe_align_block_size_kernels.cu +++ b/csrc/moe_align_block_size_kernels.cu @@ -95,7 +95,7 @@ void moe_align_block_size( const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); assert(num_experts <= NUM_MAX_EXPERTS); VLLM_DISPATCH_INTEGRAL_TYPES( - topk_ids.scalar_type(), "moe_alig_block_size_kernel", [&] { + topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { vllm::moe_align_block_size_kernel<<<1, num_experts, 0, stream>>>( topk_ids.data_ptr(), sorted_token_ids.data_ptr(), diff --git a/csrc/ops.h b/csrc/ops.h index 6e52dd81bc517..2bcd0c2efc5c6 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -100,6 +100,13 @@ void gptq_shuffle( torch::Tensor q_weight, torch::Tensor q_perm); +void moe_align_block_size( + torch::Tensor topk_ids, + int num_experts, + int block_size, + torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad); #ifndef USE_ROCM using fptr_t = uint64_t; @@ -121,12 +128,3 @@ std::pair, std::vector> get_graph_buffer_ipc_meta( void register_graph_buffers(fptr_t _fa, const std::vector &handles, const std::vector> &offsets); #endif - -void moe_align_block_size( - torch::Tensor topk_ids, - int num_experts, - int block_size, - torch::Tensor sorted_token_ids, - torch::Tensor experts_ids, - torch::Tensor num_tokens_post_pad - ); diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index a8a998830e868..8a8235691ab8e 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -57,9 +57,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ"); ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); ops.def( - "moe_align_block_size", - &moe_align_block_size, - "Aligning the number of tokens to be processed by each expert such that it is divisible by the block size."); + "moe_align_block_size", + &moe_align_block_size, + "Aligning the number of tokens to be processed by each expert such that it is divisible by the block size."); // Cache ops pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a8dadce24aa1d..f36c35fd27ad5 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -23,8 +23,6 @@ """Inference-only Mixtral model.""" from typing import List, Optional, Tuple 
-import numpy as np - import torch import torch.nn.functional as F @@ -33,10 +31,11 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, - ReplicatedLinear, QKVParallelLinear, + ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler @@ -47,6 +46,7 @@ from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput @@ -54,85 +54,77 @@ KVCache = Tuple[torch.Tensor, torch.Tensor] -class MixtralMLP(nn.Module): +class MixtralMoE(nn.Module): + """A tensor-parallel MoE implementation for Mixtral that shards each expert + across all ranks. + + Each expert's weights are sharded across all ranks and a fused MoE + kernel is used for the forward pass, and finally we reduce the outputs + across ranks. + """ def __init__( self, num_experts: int, + top_k: int, hidden_size: int, intermediate_size: int, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: + params_dtype: Optional[torch.dtype] = None, + ): super().__init__() - self.num_experts = num_experts - self.ffn_dim = intermediate_size - self.hidden_dim = hidden_size - - self.w1 = ReplicatedLinear(self.hidden_dim, - self.ffn_dim, - bias=False, - linear_method=linear_method) - self.w2 = ReplicatedLinear(self.ffn_dim, - self.hidden_dim, - bias=False, - linear_method=linear_method) - self.w3 = ReplicatedLinear(self.hidden_dim, - self.ffn_dim, - bias=False, - linear_method=linear_method) - - # TODO: Use vllm's SiluAndMul - self.act_fn = nn.SiLU() - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - w1_out, _ = self.w1(hidden_states) - w1_out = self.act_fn(w1_out) - w3_out, _ = self.w3(hidden_states) - current_hidden_states = w1_out * w3_out - current_hidden_states, _ = self.w2(current_hidden_states) - return current_hidden_states - + tp_size = get_tensor_model_parallel_world_size() + self.num_total_experts = num_experts + self.top_k = top_k + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size // tp_size -class MixtralMoE(nn.Module): + if params_dtype is None: + params_dtype = torch.get_default_dtype() + self.params_dtype = params_dtype - def __init__( - self, - config: MixtralConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.rank = get_tensor_model_parallel_rank() - self.tp_size = get_tensor_model_parallel_world_size() - self.num_total_experts = config.num_local_experts - self.top_k = config.num_experts_per_tok - if self.tp_size > self.num_total_experts: - raise ValueError( - f"Tensor parallel size {self.tp_size} is greater than " - f"the number of experts {self.num_total_experts}.") - # Split experts equally between ranks - self.expert_indicies = np.array_split(range( - self.num_total_experts), self.tp_size)[self.rank].tolist() - if not self.expert_indicies: - raise ValueError( - f"Rank {self.rank} has no experts assigned to it.") - - self.experts = 
nn.ModuleList([ - MixtralMLP(self.num_total_experts, - config.hidden_size, - config.intermediate_size, - linear_method=linear_method) - if idx in self.expert_indicies else None - for idx in range(self.num_total_experts) - ]) - self.gate = ReplicatedLinear(config.hidden_size, + self.gate = ReplicatedLinear(self.hidden_size, self.num_total_experts, bias=False, + params_dtype=self.params_dtype, linear_method=None) + self.ws = nn.Parameter( + torch.empty(self.num_total_experts, + 2 * self.intermediate_size, + self.hidden_size, + device="cuda", + dtype=self.params_dtype)) + self.w2s = nn.Parameter( + torch.empty(self.num_total_experts, + self.hidden_size, + self.intermediate_size, + device="cuda", + dtype=self.params_dtype)) + + set_weight_attrs(self.ws, { + "weight_loader": self.weight_loader, + }) + set_weight_attrs(self.w2s, { + "weight_loader": self.weight_loader, + }) + + def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, + weight_name: str, expert_id: int): + tp_rank = get_tensor_model_parallel_rank() + param_data = param.data + shard_size = self.intermediate_size + shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) + if weight_name.endswith("w1.weight"): + param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :] + if weight_name.endswith("w3.weight"): + param_data[expert_id, + shard_size:2 * shard_size, :] = loaded_weight[shard, :] + if weight_name.endswith("w2.weight"): + param_data[expert_id, :, :] = loaded_weight[:, shard] + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - batch_size, sequence_length, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) + batch_size, sequence_length, hidden_size = hidden_states.shape + hidden_states = hidden_states.view(-1, self.hidden_size) # router_logits: (batch * sequence_length, n_experts) router_logits, _ = self.gate(hidden_states) @@ -142,22 +134,18 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: dim=-1) routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - final_hidden_states = None - for expert_idx in self.expert_indicies: - expert_layer = self.experts[expert_idx] - expert_mask = (selected_experts == expert_idx) - expert_weights = (routing_weights * expert_mask).sum(dim=-1, - keepdim=True) - - current_hidden_states = expert_layer(hidden_states).mul_( - expert_weights) - if final_hidden_states is None: - final_hidden_states = current_hidden_states - else: - final_hidden_states.add_(current_hidden_states) + final_hidden_states = fused_moe(hidden_states, + self.ws, + self.w2s, + routing_weights, + selected_experts, + inplace=True) + + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) - return tensor_model_parallel_all_reduce(final_hidden_states).view( - batch_size, sequence_length, hidden_dim) + return final_hidden_states.view(batch_size, sequence_length, + hidden_size) class MixtralAttention(nn.Module): @@ -257,8 +245,11 @@ def __init__( rope_theta=rope_theta, sliding_window=config.sliding_window, linear_method=linear_method) - self.block_sparse_moe = MixtralMoE(config=config, - linear_method=linear_method) + self.block_sparse_moe = MixtralMoE( + num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm(config.hidden_size, @@ -378,6 +369,14 @@ def load_weights(self, ("qkv_proj", "v_proj", 
"v"), ] + expert_params_mapping = [ + # (param_name, weight_name, expert_id) + ("ws" if weight_name in ["w1", "w3"] else "w2s", + f"experts.{expert_id}.{weight_name}.weight", expert_id) + for expert_id in range(self.config.num_local_experts) + for weight_name in ["w1", "w2", "w3"] + ] + params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, @@ -387,6 +386,7 @@ def load_weights(self, fall_back_to_pt=False): if "rotary_emb.inv_freq" in name: continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue @@ -399,14 +399,22 @@ def load_weights(self, weight_loader(param, loaded_weight, shard_id) break else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Skip experts that are not assigned to this worker. - if ("block_sparse_moe.experts." in name - and name not in params_dict): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) + for param_name, weight_name, expert_id in expert_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + weight_name, + expert_id=expert_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From d79ced3292445d8471b3c4e5ce2dbf311834ec1b Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Wed, 31 Jan 2024 00:17:05 +0800 Subject: [PATCH 017/112] Fix 'Actor methods cannot be called directly' when using `--engine-use-ray` (#2664) * fix: engine-useray complain * fix: typo --- vllm/engine/async_llm_engine.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 2885aab9f3161..a63d48016b83c 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -447,11 +447,19 @@ async def add_request( if arrival_time is None: arrival_time = time.time() - prompt_token_ids = await self.engine.encode_request_async( - request_id=request_id, - prompt=prompt, - prompt_token_ids=prompt_token_ids, - lora_request=lora_request) + + if self.engine_use_ray: + prompt_token_ids = await self.engine.encode_request_async.remote( + request_id=request_id, + prompt=prompt, + prompt_token_ids=prompt_token_ids, + lora_request=lora_request) + else: + prompt_token_ids = await self.engine.encode_request_async( + request_id=request_id, + prompt=prompt, + prompt_token_ids=prompt_token_ids, + lora_request=lora_request) stream = self._request_tracker.add_request( request_id, From 4f65af0e252066d961bf864d0862f442e497f619 Mon Sep 17 00:00:00 2001 From: Vladimir Date: Tue, 30 Jan 2024 18:30:50 +0100 Subject: [PATCH 018/112] Add swap_blocks unit tests (#2616) --- tests/kernels/test_cache.py | 68 +++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 193bc29bd431d..320ea0c095ac2 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -3,8 +3,11 @@ import pytest import torch +from typing import 
Tuple + from vllm._C import cache_ops +COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing @@ -153,3 +156,68 @@ def test_reshape_and_cache( assert torch.allclose(key_cache, cloned_key_cache) assert torch.allclose(value_cache, cloned_value_cache) + + +@pytest.mark.parametrize("direction", COPYING_DIRECTION) +@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", DEVICES) +@torch.inference_mode() +def test_swap_blocks( + kv_cache_factory, + direction: Tuple[str, str], + num_mappings: int, + num_heads: int, + head_size: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + seed: int, + device: int, +) -> None: + random.seed(seed) + torch.random.manual_seed(seed) + torch.cuda.manual_seed(seed) + src_device = f"{direction[0]}:{device}" if direction[ + 0] == "cuda" else direction[0] + dst_device = f"{direction[1]}:{device}" if direction[ + 1] == "cuda" else direction[1] + + src_blocks = random.sample(range(num_blocks), num_mappings) + # For the same device, mapping must not overlap + if src_device == dst_device: + remaining_blocks = list(set(range(num_blocks)) - set(src_blocks)) + dst_blocks = random.sample(remaining_blocks, num_mappings) + else: + dst_blocks = random.sample(range(num_blocks), num_mappings) + + block_mapping = dict(zip(src_blocks, dst_blocks)) + + # Create the KV caches on the first device. + src_key_caches, src_value_caches = kv_cache_factory( + num_blocks, block_size, 1, num_heads, head_size, dtype, seed, + src_device) + + # Create the KV caches on the second device. + dist_key_caches, dist_value_caches = kv_cache_factory( + num_blocks, block_size, 1, num_heads, head_size, dtype, seed, + dst_device) + + src_key_caches_clone = src_key_caches[0].clone() + src_value_caches_clone = src_value_caches[0].clone() + + # Call the swap_blocks kernel. 
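+    # (swap_blocks copies each source block to its mapped destination block, for
+    #  both the key and the value caches; the clones taken above preserve the
+    #  pre-swap source contents so the copies can be checked afterwards.)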
+ cache_ops.swap_blocks(src_key_caches[0], dist_key_caches[0], block_mapping) + cache_ops.swap_blocks(src_value_caches[0], dist_value_caches[0], + block_mapping) + + for src, dst in block_mapping.items(): + assert torch.allclose(src_key_caches_clone[src].cpu(), + dist_key_caches[0][dst].cpu()) + assert torch.allclose(src_value_caches_clone[src].cpu(), + dist_value_caches[0][dst].cpu()) From bbe9bd9684218038cca9663ac79dfe7c4f28a351 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 30 Jan 2024 13:40:37 -0800 Subject: [PATCH 019/112] [Minor] Fix a small typo (#2672) --- vllm/model_executor/parallel_utils/parallel_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/parallel_utils/parallel_state.py b/vllm/model_executor/parallel_utils/parallel_state.py index 46bff7e16b79f..59cc196538571 100644 --- a/vllm/model_executor/parallel_utils/parallel_state.py +++ b/vllm/model_executor/parallel_utils/parallel_state.py @@ -117,7 +117,7 @@ def model_parallel_is_initialized(): def get_tensor_model_parallel_group(): """Get the tensor model parallel group the caller rank belongs to.""" assert _TENSOR_MODEL_PARALLEL_GROUP is not None, ( - "tenosr model parallel group is not initialized") + "tensor model parallel group is not initialized") return _TENSOR_MODEL_PARALLEL_GROUP From 105a40f53a8001ce7f4282462a4636b31383f393 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 30 Jan 2024 14:39:40 -0800 Subject: [PATCH 020/112] [Minor] Fix false warning when TP=1 (#2674) --- vllm/model_executor/parallel_utils/custom_all_reduce.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/parallel_utils/custom_all_reduce.py b/vllm/model_executor/parallel_utils/custom_all_reduce.py index 5b88649cc2129..628c151761fb2 100644 --- a/vllm/model_executor/parallel_utils/custom_all_reduce.py +++ b/vllm/model_executor/parallel_utils/custom_all_reduce.py @@ -29,6 +29,10 @@ def init_custom_ar() -> None: return rank = get_tensor_model_parallel_rank() world_size = get_tensor_model_parallel_world_size() + if world_size == 1: + # No need to initialize custom allreduce for single GPU case. 
+ return + if world_size not in _SUPPORTED_WORLD_SIZES: logger.warn( "Custom allreduce is disabled due to an unsupported world size: " From 3dad94448583a835230ae68c726d4b67e46845f2 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 30 Jan 2024 16:34:10 -0800 Subject: [PATCH 021/112] Add quantized mixtral support (#2673) --- vllm/model_executor/model_loader.py | 13 +- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/mixtral_quant.py | 412 ++++++++++++++++++++ 3 files changed, 422 insertions(+), 4 deletions(-) create mode 100644 vllm/model_executor/models/mixtral_quant.py diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index 0f1125e5c8e3e..bf13ebf57d422 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -4,7 +4,6 @@ import torch import torch.nn as nn -from transformers import PretrainedConfig from vllm.config import ModelConfig, LoRAConfig from vllm.model_executor.models import ModelRegistry @@ -21,8 +20,14 @@ def _set_default_torch_dtype(dtype: torch.dtype): torch.set_default_dtype(old_dtype) -def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: - architectures = getattr(config, "architectures", []) +def _get_model_architecture(model_config: ModelConfig) -> Type[nn.Module]: + architectures = getattr(model_config.hf_config, "architectures", []) + # Special handling for quantized Mixtral. + # FIXME(woosuk): This is a temporary hack. + if (model_config.quantization is not None + and "MixtralForCausalLM" in architectures): + architectures = ["QuantMixtralForCausalLM"] + for arch in architectures: model_cls = ModelRegistry.load_model_cls(arch) if model_cls is not None: @@ -34,7 +39,7 @@ def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: def get_model(model_config: ModelConfig, lora_config: Optional[LoRAConfig] = None) -> nn.Module: - model_class = _get_model_architecture(model_config.hf_config) + model_class = _get_model_architecture(model_config) # Get the (maybe quantized) linear method. linear_method = None diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 93631d260abcb..a26a513a60036 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -30,6 +30,7 @@ "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MistralForCausalLM": ("mistral", "MistralForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), + "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), # transformers's mpt class has lower case "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py new file mode 100644 index 0000000000000..a8dadce24aa1d --- /dev/null +++ b/vllm/model_executor/models/mixtral_quant.py @@ -0,0 +1,412 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Mixtral model.""" +from typing import List, Optional, Tuple + +import numpy as np + +import torch +import torch.nn.functional as F + +from torch import nn +from transformers import MixtralConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + ReplicatedLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.communication_op import ( + tensor_model_parallel_all_reduce) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class MixtralMLP(nn.Module): + + def __init__( + self, + num_experts: int, + hidden_size: int, + intermediate_size: int, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.num_experts = num_experts + self.ffn_dim = intermediate_size + self.hidden_dim = hidden_size + + self.w1 = ReplicatedLinear(self.hidden_dim, + self.ffn_dim, + bias=False, + linear_method=linear_method) + self.w2 = ReplicatedLinear(self.ffn_dim, + self.hidden_dim, + bias=False, + linear_method=linear_method) + self.w3 = ReplicatedLinear(self.hidden_dim, + self.ffn_dim, + bias=False, + linear_method=linear_method) + + # TODO: Use vllm's SiluAndMul + self.act_fn = nn.SiLU() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + w1_out, _ = self.w1(hidden_states) + w1_out = self.act_fn(w1_out) + w3_out, _ = self.w3(hidden_states) + current_hidden_states = w1_out * w3_out + current_hidden_states, _ = self.w2(current_hidden_states) + return current_hidden_states + + +class MixtralMoE(nn.Module): + + def __init__( + self, + config: MixtralConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + self.num_total_experts = config.num_local_experts + self.top_k = config.num_experts_per_tok + if self.tp_size > self.num_total_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {self.num_total_experts}.") + # Split experts equally between ranks + self.expert_indicies = np.array_split(range( + self.num_total_experts), self.tp_size)[self.rank].tolist() + if not self.expert_indicies: + 
raise ValueError( + f"Rank {self.rank} has no experts assigned to it.") + + self.experts = nn.ModuleList([ + MixtralMLP(self.num_total_experts, + config.hidden_size, + config.intermediate_size, + linear_method=linear_method) + if idx in self.expert_indicies else None + for idx in range(self.num_total_experts) + ]) + self.gate = ReplicatedLinear(config.hidden_size, + self.num_total_experts, + bias=False, + linear_method=None) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (batch * sequence_length, n_experts) + router_logits, _ = self.gate(hidden_states) + + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, + self.top_k, + dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + final_hidden_states = None + for expert_idx in self.expert_indicies: + expert_layer = self.experts[expert_idx] + expert_mask = (selected_experts == expert_idx) + expert_weights = (routing_weights * expert_mask).sum(dim=-1, + keepdim=True) + + current_hidden_states = expert_layer(hidden_states).mul_( + expert_weights) + if final_hidden_states is None: + final_hidden_states = current_hidden_states + else: + final_hidden_states.add_(current_hidden_states) + + return tensor_model_parallel_all_reduce(final_hidden_states).view( + batch_size, sequence_length, hidden_dim) + + +class MixtralAttention(nn.Module): + + def __init__(self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 4096 * 32, + rope_theta: float = 10000, + linear_method: Optional[LinearMethodBase] = None, + sliding_window: Optional[int] = None) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.sliding_window = sliding_window + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position, + base=int(self.rope_theta), + is_neox_style=True, + ) + self.attn = PagedAttention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class MixtralDecoderLayer(nn.Module): + + def __init__( + self, + config: MixtralConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + # Requires transformers > 4.32.0 + rope_theta = getattr(config, "rope_theta", 10000) + self.self_attn = MixtralAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + max_position=config.max_position_embeddings, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + sliding_window=config.sliding_window, + linear_method=linear_method) + self.block_sparse_moe = MixtralMoE(config=config, + linear_method=linear_method) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.block_sparse_moe(hidden_states) + return hidden_states, residual + + +class MixtralModel(nn.Module): + + def __init__( + self, + config: MixtralConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + MixtralDecoderLayer(config, linear_method=linear_method) + for _ in range(config.num_hidden_layers) + ]) + 
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, + kv_caches[i], input_metadata, + residual) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class MixtralForCausalLM(nn.Module): + + def __init__( + self, + config: MixtralConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = MixtralModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, + cache_dir, + load_format, + revision, + fall_back_to_pt=False): + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip experts that are not assigned to this worker. + if ("block_sparse_moe.experts." 
in name + and name not in params_dict): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From 1af090b57d0e23d268e79941f8084bf0a8ad8621 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Wed, 31 Jan 2024 00:07:07 -0800 Subject: [PATCH 022/112] Bump up version to v0.3.0 (#2656) --- README.md | 4 +++- docs/source/index.rst | 4 +++- vllm/__init__.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c0d267b2cbbf3..3853760613833 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ vLLM is fast with: - Efficient management of attention key and value memory with **PagedAttention** - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph -- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629) +- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache - Optimized CUDA kernels vLLM is flexible and easy to use with: @@ -57,6 +57,8 @@ vLLM is flexible and easy to use with: - Streaming outputs - OpenAI-compatible API server - Support NVIDIA GPUs and AMD GPUs +- (Experimental) Prefix caching support +- (Experimental) Multi-lora support vLLM seamlessly supports many Hugging Face models, including the following architectures: diff --git a/docs/source/index.rst b/docs/source/index.rst index 321f855645bb8..3e2331907f0f2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -31,7 +31,7 @@ vLLM is fast with: * Efficient management of attention key and value memory with **PagedAttention** * Continuous batching of incoming requests * Fast model execution with CUDA/HIP graph -* Quantization: `GPTQ `_, `AWQ `_, `SqueezeLLM `_ +* Quantization: `GPTQ `_, `AWQ `_, `SqueezeLLM `_, FP8 KV Cache * Optimized CUDA kernels vLLM is flexible and easy to use with: @@ -42,6 +42,8 @@ vLLM is flexible and easy to use with: * Streaming outputs * OpenAI-compatible API server * Support NVIDIA GPUs and AMD GPUs +* (Experimental) Prefix caching support +* (Experimental) Multi-lora support For more information, check out the following: diff --git a/vllm/__init__.py b/vllm/__init__.py index 327dfad06352c..36d177f5942e7 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -8,7 +8,7 @@ from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import SamplingParams -__version__ = "0.2.7" +__version__ = "0.3.0" __all__ = [ "LLM", From d69ff0cbbbb07b571eeea62b4e2ce87b91cea387 Mon Sep 17 00:00:00 2001 From: Tao He Date: Thu, 1 Feb 2024 01:00:13 +0800 Subject: [PATCH 023/112] Fixes assertion failure in prefix caching: the lora index mapping should respect prefix_len (#2688) Signed-off-by: Tao He --- vllm/worker/model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 2a12152a70863..2df9fd5215a2d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -142,10 +142,10 @@ def _prepare_prompt( if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - lora_index_mapping.append([lora_id] * prompt_len) + lora_index_mapping.append([lora_id] * (prompt_len - prefix_len)) lora_prompt_mapping.extend( [lora_id] * - (prompt_len + (prompt_len - prefix_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) if 
seq_group_metadata.block_tables is None: From c664b0e6838644c22839b6e9c88e61b4e9a540f6 Mon Sep 17 00:00:00 2001 From: zspo Date: Thu, 1 Feb 2024 02:09:23 +0800 Subject: [PATCH 024/112] fix some bugs (#2689) --- vllm/config.py | 5 ++++- vllm/engine/async_llm_engine.py | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 197f20c1ec9a5..4fb7357a3da21 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -355,6 +355,9 @@ class ParallelConfig: worker_use_ray: Whether to use Ray for model workers. Will be set to True if either pipeline_parallel_size or tensor_parallel_size is greater than 1. + max_parallel_loading_workers: Maximum number of multiple batches + when load model sequentially. To avoid RAM OOM when using tensor + parallel and large models. disable_custom_all_reduce: Disable the custom all-reduce kernel and fall back to NCCL. """ @@ -470,7 +473,7 @@ def __post_init__(self): elif self.max_cpu_loras < self.max_loras: raise ValueError( f"max_cpu_loras ({self.max_cpu_loras}) must be >= " - f"max_num_seqs ({self.max_loras})") + f"max_loras ({self.max_loras})") def verify_with_model_config(self, model_config: ModelConfig): if self.lora_dtype in (None, "auto"): diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index a63d48016b83c..7cba654602779 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -296,6 +296,8 @@ class AsyncLLMEngine: async frontend will be executed in a separate process as the model workers. log_requests: Whether to log the requests. + max_log_len: Maximum number of prompt characters or prompt ID numbers + being printed in log. start_engine_loop: If True, the background task to run the engine will be automatically started in the generate call. *args: Arguments for LLMEngine. @@ -431,8 +433,8 @@ async def add_request( logger.info(f"Received request {request_id}: " f"prompt: {shortened_prompt!r}, " f"prefix_pos: {prefix_pos}," - f"sampling params: {sampling_params}, " - f"prompt token ids: {shortened_token_ids}, " + f"sampling_params: {sampling_params}, " + f"prompt_token_ids: {shortened_token_ids}, " f"lora_request: {lora_request}.") if not self.is_running: From 89efcf1ce53cd01c27384e3c3e1c6b0761978076 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 31 Jan 2024 10:12:11 -0800 Subject: [PATCH 025/112] [Minor] Fix test_cache.py CI test failure (#2684) --- tests/kernels/test_cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 320ea0c095ac2..275ef8194d0bd 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -200,12 +200,12 @@ def test_swap_blocks( # Create the KV caches on the first device. src_key_caches, src_value_caches = kv_cache_factory( - num_blocks, block_size, 1, num_heads, head_size, dtype, seed, + num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed, src_device) # Create the KV caches on the second device. 
dist_key_caches, dist_value_caches = kv_cache_factory( - num_blocks, block_size, 1, num_heads, head_size, dtype, seed, + num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed, dst_device) src_key_caches_clone = src_key_caches[0].clone() From d0d93b92b190f420e2628350ec69921bede691d4 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 31 Jan 2024 14:34:17 -0800 Subject: [PATCH 026/112] Add unit test for Mixtral MoE layer (#2677) --- Dockerfile | 6 ++ tests/kernels/test_fused_moe.py | 50 ------------ tests/kernels/test_moe.py | 104 ++++++++++++++++++++++++ vllm/model_executor/layers/fused_moe.py | 4 +- vllm/model_executor/models/mixtral.py | 10 ++- 5 files changed, 119 insertions(+), 55 deletions(-) delete mode 100644 tests/kernels/test_fused_moe.py create mode 100644 tests/kernels/test_moe.py diff --git a/Dockerfile b/Dockerfile index 4cfcf058004c5..364345d60e041 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,12 @@ FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev RUN apt-get update -y \ && apt-get install -y python3-pip git +# Workaround for https://github.com/openai/triton/issues/2507 and +# https://github.com/pytorch/pytorch/issues/107960 -- hopefully +# this won't be needed for future versions of this docker image +# or future versions of triton. +RUN ldconfig /usr/local/cuda-12.1/compat/ + WORKDIR /workspace # install build and runtime dependencies diff --git a/tests/kernels/test_fused_moe.py b/tests/kernels/test_fused_moe.py deleted file mode 100644 index 80a0349d6575b..0000000000000 --- a/tests/kernels/test_fused_moe.py +++ /dev/null @@ -1,50 +0,0 @@ -import pytest -import torch - -from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.activation import SiluAndMul - - -def torch_moe(a, w1, w2, topk_weight, topk_ids): - B, D = a.shape - a = a.view(B, -1, D).repeat(1, topk_ids.shape[1], 1).reshape(-1, D) - out = torch.zeros(B * topk_ids.shape[1], - w2.shape[1], - dtype=a.dtype, - device=a.device) - topk_ids = topk_ids.view(-1) - topk_weight = topk_weight.view(-1) - for i in range(w1.shape[0]): - mask = topk_ids == i - if mask.sum(): - out[mask] = SiluAndMul()( - a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1) - return (out.view(B, -1, w2.shape[1]) * - topk_weight.view(B, -1, 1)).sum(dim=1) - - -@pytest.mark.parametrize("m", [512, 222, 33, 1]) -@pytest.mark.parametrize("n", [2048, 256, 1024]) -@pytest.mark.parametrize("k", [128, 511, 1024]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) -def test_fused_moe( - m: int, - n: int, - k: int, - e: int, - topk: int, - dtype: torch.dtype, -): - a = torch.randn((m, k), device='cuda', dtype=dtype) / 10 - w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10 - w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10 - - score = torch.randn((m, e), device='cuda', dtype=dtype) - score = torch.softmax(score, dim=-1) - topk_weight, topk_ids = torch.topk(score, topk) - - triton_output = fused_moe(a, w1, w2, topk_weight, topk_ids, False) - torch_output = torch_moe(a, w1, w2, topk_weight, topk_ids) - assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py new file mode 100644 index 0000000000000..227ddfc3661b3 --- /dev/null +++ b/tests/kernels/test_moe.py @@ -0,0 +1,104 @@ +"""Tests for the MOE layers. + +Run `pytest tests/kernels/test_moe.py`. 
+""" + +import pytest +import torch + +from transformers import MixtralConfig +from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock + +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.models.mixtral import MixtralMoE + + +def torch_moe(a, w1, w2, topk_weight, topk_ids): + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk_ids.shape[1], 1).reshape(-1, D) + out = torch.zeros(B * topk_ids.shape[1], + w2.shape[1], + dtype=a.dtype, + device=a.device) + topk_ids = topk_ids.view(-1) + topk_weight = topk_weight.view(-1) + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + out[mask] = SiluAndMul()( + a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1) + return (out.view(B, -1, w2.shape[1]) * + topk_weight.view(B, -1, 1)).sum(dim=1) + + +@pytest.mark.parametrize("m", [512, 222, 33, 1]) +@pytest.mark.parametrize("n", [2048, 256, 1024]) +@pytest.mark.parametrize("k", [128, 511, 1024]) +@pytest.mark.parametrize("e", [8, 64]) +@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +def test_fused_moe( + m: int, + n: int, + k: int, + e: int, + topk: int, + dtype: torch.dtype, +): + a = torch.randn((m, k), device='cuda', dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10 + + score = torch.randn((m, e), device='cuda', dtype=dtype) + score = torch.softmax(score, dim=-1) + topk_weight, topk_ids = torch.topk(score, topk) + + triton_output = fused_moe(a, w1, w2, topk_weight, topk_ids, False) + torch_output = torch_moe(a, w1, w2, topk_weight, topk_ids) + assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0) + + +@pytest.mark.parametrize("dtype", + [torch.float32, torch.float16, torch.bfloat16]) +@torch.inference_mode() +def test_mixtral_moe(dtype: torch.dtype): + "Make sure our Mixtral MoE implementation agrees with the one from huggingface." 
+ + # Instantiate our and huggingface's MoE blocks + config = MixtralConfig() + hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda") + vllm_moe = MixtralMoE( + num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + params_dtype=dtype, + tp_size=1, + ) + + # Load the weights + vllm_moe.gate.linear_weights["weight"][:] = hf_moe.gate.weight.data + for i in range(config.num_local_experts): + weights = (hf_moe.experts[i].w1.weight.data, + hf_moe.experts[i].w3.weight.data) + vllm_moe.ws[i][:] = torch.cat(weights, dim=0) + vllm_moe.w2s[i][:] = hf_moe.experts[i].w2.weight.data + + # Generate input batch of dimensions [batch_size, seq_len, hidden_dim] + inputs = torch.randn((1, 64, config.hidden_size)).to(dtype).to("cuda") + + # Run forward passes for both MoE blocks + hf_states, _ = hf_moe.forward(inputs) + vllm_states = vllm_moe.forward(inputs) + + mixtral_moe_tol = { + torch.float32: 1e-3, + torch.float16: 1e-3, + torch.bfloat16: 1e-2, + } + + assert torch.allclose(hf_states, + vllm_states, + rtol=mixtral_moe_tol[dtype], + atol=mixtral_moe_tol[dtype]) diff --git a/vllm/model_executor/layers/fused_moe.py b/vllm/model_executor/layers/fused_moe.py index 998062d82d1f0..eed2e83bed7f8 100644 --- a/vllm/model_executor/layers/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe.py @@ -235,7 +235,9 @@ def fused_moe(hidden_states: torch.Tensor, assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" assert w1.is_contiguous(), "Expert weights1 must be contiguous" assert w2.is_contiguous(), "Expert weights2 must be contiguous" - assert hidden_states.dtype in [torch.float16, torch.bfloat16] + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16 + ] M, _ = hidden_states.shape E, N, _ = w1.shape diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index f36c35fd27ad5..a8e470395b904 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -70,13 +70,14 @@ def __init__( hidden_size: int, intermediate_size: int, params_dtype: Optional[torch.dtype] = None, + tp_size: Optional[int] = None, ): super().__init__() - tp_size = get_tensor_model_parallel_world_size() + self.tp_size = tp_size or get_tensor_model_parallel_world_size() self.num_total_experts = num_experts self.top_k = top_k self.hidden_size = hidden_size - self.intermediate_size = intermediate_size // tp_size + self.intermediate_size = intermediate_size // self.tp_size if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -141,8 +142,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: selected_experts, inplace=True) - final_hidden_states = tensor_model_parallel_all_reduce( - final_hidden_states) + if self.tp_size > 1: + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) return final_hidden_states.view(batch_size, sequence_length, hidden_size) From 93b38bea5dd03e1b140ca997dfaadef86f8f1855 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+rib-2@users.noreply.github.com> Date: Wed, 31 Jan 2024 14:58:07 -0800 Subject: [PATCH 027/112] Refactor Prometheus and Add Request Level Metrics (#2316) --- examples/production_monitoring/README.md | 54 + .../production_monitoring/docker-compose.yaml | 19 + examples/production_monitoring/grafana.json | 931 ++++++++++++++++++ .../production_monitoring/prometheus.yaml | 10 + vllm/engine/llm_engine.py | 148 ++- vllm/engine/metrics.py | 
166 +++- vllm/sequence.py | 8 +- 7 files changed, 1234 insertions(+), 102 deletions(-) create mode 100644 examples/production_monitoring/README.md create mode 100644 examples/production_monitoring/docker-compose.yaml create mode 100644 examples/production_monitoring/grafana.json create mode 100644 examples/production_monitoring/prometheus.yaml diff --git a/examples/production_monitoring/README.md b/examples/production_monitoring/README.md new file mode 100644 index 0000000000000..29b611caeda23 --- /dev/null +++ b/examples/production_monitoring/README.md @@ -0,0 +1,54 @@ +# vLLM + Prometheus/Grafana + +This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites. + +Install: +- [`docker`](https://docs.docker.com/engine/install/) +- [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository) + +### Launch + +Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint: +```bash +python3 -m vllm.entrypoints.openai.api_server \ + --model mistralai/Mistral-7B-v0.1 \ + --max-model-len 2048 \ + --disable-log-requests +``` + +Launch Prometheus and Grafana servers with `docker compose`: +```bash +docker compose up +``` + +Submit some sample requests to the server: +```bash +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +python3 ../../benchmarks/benchmark_serving.py \ + --model mistralai/Mistral-7B-v0.1 \ + --tokenizer mistralai/Mistral-7B-v0.1 \ + --endpoint /v1/completions \ + --dataset ShareGPT_V3_unfiltered_cleaned_split.json \ + --request-rate 3.0 +``` + +Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM. + +### Grafana Dashboard + +Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`). + +#### Add Prometheus Data Source + +Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. + +On Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates DNS name for each containers. You can just use `http://prometheus:9090`. + +Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.". + +#### Import Dashboard + +Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. 
You should see a screen that looks like the following: + +![Grafana Dashboard Image](https://i.imgur.com/R2vH9VW.png) diff --git a/examples/production_monitoring/docker-compose.yaml b/examples/production_monitoring/docker-compose.yaml new file mode 100644 index 0000000000000..13b987c120f7d --- /dev/null +++ b/examples/production_monitoring/docker-compose.yaml @@ -0,0 +1,19 @@ +# docker-compose.yaml +version: "3" + +services: + prometheus: + image: prom/prometheus:latest + extra_hosts: + - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine + ports: + - "9090:9090" # the default port used by Prometheus + volumes: + - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file + + grafana: + image: grafana/grafana:latest + depends_on: + - prometheus + ports: + - "3000:3000" # the default port used by Grafana diff --git a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json new file mode 100644 index 0000000000000..f48b6314eb055 --- /dev/null +++ b/examples/production_monitoring/grafana.json @@ -0,0 +1,931 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.2.3" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Monitoring vLLM Inference Server", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "End to end request latency measured in seconds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": 
"histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:e2e_request_latency_seconds_sum[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E" + } + ], + "title": "E2E Request Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of tokens processed per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "rate(vllm:prompt_tokens_total[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "Prompt 
Tokens/Sec", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "rate(vllm:generation_tokens_total[$__rate_interval])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "Generation Tokens/Sec", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Token Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Inter token latency in seconds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { 
+ "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:time_per_output_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Mean", + "range": true, + "refId": "E" + } + ], + "title": "Time Per Output Token Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of requests in RUNNING, WAITING, and SWAPPED state", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:num_requests_running", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Num Running", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:num_requests_swapped", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Num Swapped", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:num_requests_waiting", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Num Waiting", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "Scheduler State", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "P50, P90, P95, and P99 TTFT latency in seconds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + 
"spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:time_to_first_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E" + } + ], + "title": "Time To First Token Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Percentage of used cache blocks by vLLM.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "vllm:gpu_cache_usage_perc", + "instant": false, + "legendFormat": "GPU Cache Usage", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "vllm:cpu_cache_usage_perc", + "hide": false, + "instant": false, + "legendFormat": "CPU Cache Usage", + "range": true, + "refId": "B" + } + ], + "title": "Cache Utilization", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "vLLM", + "uid": "b281712d-8bff-41ef-9f3f-71ad43c05e9b", + "version": 2, + "weekStart": "" +} diff --git a/examples/production_monitoring/prometheus.yaml b/examples/production_monitoring/prometheus.yaml new file mode 100644 index 0000000000000..754533b9dfbd0 --- /dev/null +++ b/examples/production_monitoring/prometheus.yaml @@ -0,0 +1,10 @@ +# prometheus.yaml +global: + scrape_interval: 5s + evaluation_interval: 30s + +scrape_configs: + - job_name: vllm + static_configs: + - targets: + - 'host.docker.internal:8000' diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 0d836a1fb13a9..e60efc5e54e16 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -10,7 +10,7 @@ SchedulerConfig, LoRAConfig) from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs -from vllm.engine.metrics import record_metrics +from vllm.engine.metrics import StatLogger, Stats from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray from vllm.logger import init_logger from vllm.outputs import RequestOutput @@ -28,8 +28,7 @@ from ray.util.placement_group import PlacementGroup logger = init_logger(__name__) - -_LOGGING_INTERVAL_SEC = 5 +_LOCAL_LOGGING_INTERVAL_SEC = 5 class LLMEngine: @@ -116,12 +115,10 @@ def __init__( # Create the scheduler. self.scheduler = Scheduler(scheduler_config, cache_config, lora_config) - # Logging. - self.last_logging_time = 0.0 - # List of (timestamp, num_tokens) - self.num_prompt_tokens: List[Tuple[float, int]] = [] - # List of (timestamp, num_tokens) - self.num_generation_tokens: List[Tuple[float, int]] = [] + # Metric Logging. + if self.log_stats: + self.stat_logger = StatLogger( + local_interval=_LOCAL_LOGGING_INTERVAL_SEC) def get_tokenizer_for_seq(self, sequence: Sequence): return self.tokenizer.get_lora_tokenizer(sequence.lora_request) @@ -537,6 +534,7 @@ def _check_beam_search_early_stopping( def _process_sequence_group_outputs(self, seq_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: + # Process prompt logprobs prompt_logprobs = outputs.prompt_logprobs if prompt_logprobs is not None: @@ -732,10 +730,10 @@ def _process_model_outputs( and not seq_group.prefix.computed): seq_group.prefix.computed = True + # Log stats. if self.log_stats: - # Log the system stats. 
- self._log_system_stats(scheduler_outputs.prompt_run, - scheduler_outputs.num_batched_tokens) + self.stat_logger.log(self._get_stats(scheduler_outputs)) + return request_outputs def step(self) -> List[RequestOutput]: @@ -810,81 +808,73 @@ def step(self) -> List[RequestOutput]: return self._process_model_outputs(output, scheduler_outputs) def do_log_stats(self) -> None: - self._log_system_stats(False, 0) + """Forced log when no requests active.""" + if self.log_stats: + self.stat_logger.log(self._get_stats(scheduler_outputs=None)) - def _log_system_stats( - self, - prompt_run: bool, - num_batched_tokens: int, - ) -> None: + def _get_stats(self, + scheduler_outputs: Optional[SchedulerOutputs]) -> Stats: + """Get Stats to be Logged to Prometheus.""" now = time.monotonic() - # Log the number of batched input tokens. - if prompt_run: - self.num_prompt_tokens.append((now, num_batched_tokens)) - else: - self.num_generation_tokens.append((now, num_batched_tokens)) - should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC - if not should_log: - return + # KV Cache Usage in %. + num_total_gpu = self.cache_config.num_gpu_blocks + num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks() + gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu) - # Discard the old stats. - self.num_prompt_tokens = [(t, n) for t, n in self.num_prompt_tokens - if now - t < _LOGGING_INTERVAL_SEC] - self.num_generation_tokens = [(t, n) - for t, n in self.num_generation_tokens - if now - t < _LOGGING_INTERVAL_SEC] - - if len(self.num_prompt_tokens) > 1: - total_num_tokens = sum(n for _, n in self.num_prompt_tokens[:-1]) - window = now - self.num_prompt_tokens[0][0] - avg_prompt_throughput = total_num_tokens / window - else: - avg_prompt_throughput = 0.0 - if len(self.num_generation_tokens) > 1: - total_num_tokens = sum(n - for _, n in self.num_generation_tokens[:-1]) - window = now - self.num_generation_tokens[0][0] - avg_generation_throughput = total_num_tokens / window - else: - avg_generation_throughput = 0.0 - - total_num_gpu_blocks = self.cache_config.num_gpu_blocks - num_free_gpu_blocks = ( - self.scheduler.block_manager.get_num_free_gpu_blocks()) - num_used_gpu_blocks = total_num_gpu_blocks - num_free_gpu_blocks - gpu_cache_usage = num_used_gpu_blocks / total_num_gpu_blocks - - total_num_cpu_blocks = self.cache_config.num_cpu_blocks - if total_num_cpu_blocks > 0: - num_free_cpu_blocks = ( - self.scheduler.block_manager.get_num_free_cpu_blocks()) - num_used_cpu_blocks = total_num_cpu_blocks - num_free_cpu_blocks - cpu_cache_usage = num_used_cpu_blocks / total_num_cpu_blocks - else: - cpu_cache_usage = 0.0 - - record_metrics( - avg_prompt_throughput=avg_prompt_throughput, - avg_generation_throughput=avg_generation_throughput, - scheduler_running=len(self.scheduler.running), - scheduler_swapped=len(self.scheduler.swapped), - scheduler_waiting=len(self.scheduler.waiting), + num_total_cpu = self.cache_config.num_cpu_blocks + cpu_cache_usage = 0. + if num_total_cpu > 0: + num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks( + ) + cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu) + + # Scheduler State + num_running = len(self.scheduler.running) + num_swapped = len(self.scheduler.swapped) + num_waiting = len(self.scheduler.waiting) + + # Iteration stats if we have scheduler output. 
+ num_prompt_tokens = 0 + num_generation_tokens = 0 + time_to_first_tokens = [] + time_per_output_tokens = [] + time_e2e_requests = [] + if scheduler_outputs is not None: + prompt_run = scheduler_outputs.prompt_run + + # Number of Tokens. + if prompt_run: + num_prompt_tokens = scheduler_outputs.num_batched_tokens + else: + num_generation_tokens = scheduler_outputs.num_batched_tokens + + # Latency Timings. + time_last_iters = [] + for seq_group in scheduler_outputs.scheduled_seq_groups: + # Time since last token. (n.b. updates seq_group.last_token_time) + time_last_iters.append(seq_group.get_last_latency(now)) + # Time since arrival for all finished requests. + if seq_group.is_finished(): + time_e2e_requests.append(now - seq_group.arrival_time) + + time_to_first_tokens = time_last_iters if prompt_run else [] + time_per_output_tokens = [] if prompt_run else time_last_iters + + return Stats( + now=now, + num_running=num_running, + num_swapped=num_swapped, + num_waiting=num_waiting, gpu_cache_usage=gpu_cache_usage, cpu_cache_usage=cpu_cache_usage, + num_prompt_tokens=num_prompt_tokens, + num_generation_tokens=num_generation_tokens, + time_to_first_tokens=time_to_first_tokens, + time_per_output_tokens=time_per_output_tokens, + time_e2e_requests=time_e2e_requests, ) - logger.info("Avg prompt throughput: " - f"{avg_prompt_throughput:.1f} tokens/s, " - "Avg generation throughput: " - f"{avg_generation_throughput:.1f} tokens/s, " - f"Running: {len(self.scheduler.running)} reqs, " - f"Swapped: {len(self.scheduler.swapped)} reqs, " - f"Pending: {len(self.scheduler.waiting)} reqs, " - f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, " - f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%") - self.last_logging_time = now - def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None: """Decodes the new token for a sequence.""" (new_tokens, new_output_text, prefix_offset, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index c64071207f6a0..e613b9f551b2f 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,4 +1,19 @@ -from aioprometheus import Gauge +from vllm.logger import init_logger +from aioprometheus import Counter, Gauge, Histogram + +import time +import numpy as np +from typing import List +from dataclasses import dataclass + +logger = init_logger(__name__) + +labels = {} + + +def add_global_metrics_labels(**kwargs): + labels.update(kwargs) + # The begin-* and end* here are used by the documentation generator # to extract the metrics definitions. @@ -9,12 +24,16 @@ gauge_avg_generation_throughput = Gauge( "vllm:avg_generation_throughput_toks_per_s", "Average generation throughput in tokens/s.") +counter_prompt_tokens = Counter("vllm:prompt_tokens_total", + "Number of prefill tokens processed.") +counter_generation_tokens = Counter("vllm:generation_tokens_total", + "Number of generation tokens processed.") gauge_scheduler_running = Gauge( "vllm:num_requests_running", - "Number of requests that is currently running for inference.") + "Number of requests currently running on GPU.") gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped", - "Number requests swapped to CPU.") + "Number of requests swapped to CPU.") gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting", "Number of requests waiting to be processed.") @@ -24,28 +43,131 @@ gauge_cpu_cache_usage = Gauge( "vllm:cpu_cache_usage_perc", "CPU KV-cache usage. 
1 means 100 percent usage.") + +histogram_time_to_first_token = Histogram( + "vllm:time_to_first_token_seconds", + "Histogram of time to first token in seconds.", + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, 0.75, 1.0, + 2.5, 5.0, 7.5, 10.0 + ]) +histogram_time_per_output_tokens = Histogram( + "vllm:time_per_output_token_seconds", + "Histogram of time per output token in seconds.", + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5 + ]) +histogram_e2e_request_latency = Histogram( + "vllm:e2e_request_latency_seconds", + "Histogram of end to end request latency in seconds.", + buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) # end-metrics-definitions -labels = {} +@dataclass +class Stats: + """Created by LLMEngine for use by StatLogger.""" + now: float -def add_global_metrics_labels(**kwargs): - labels.update(kwargs) + # System stats. + num_running: int + num_waiting: int + num_swapped: int + gpu_cache_usage: float + cpu_cache_usage: float + + # Raw stats from last model iteration. + num_prompt_tokens: int + num_generation_tokens: int + time_to_first_tokens: List[float] + time_per_output_tokens: List[float] + time_e2e_requests: List[float] + + +class StatLogger: + """StatLogger is used LLMEngine to log to Promethus and Stdout.""" + + def __init__(self, local_interval: float) -> None: + # Metadata for logging locally. + self.last_local_log = time.monotonic() + self.local_interval = local_interval + + # Tracked stats over current local logging interval. + self.num_prompt_tokens: List[int] = [] + self.num_generation_tokens: List[int] = [] + + def _get_throughput(self, tracked_stats: List[int], now: float) -> float: + return float(np.sum(tracked_stats) / (now - self.last_local_log)) + + def _local_interval_elapsed(self, now: float) -> bool: + elapsed_time = now - self.last_local_log + return elapsed_time > self.local_interval + + def _log_prometheus(self, stats: Stats) -> None: + # Set system stat gauges. + gauge_scheduler_running.set(labels, stats.num_running) + gauge_scheduler_swapped.set(labels, stats.num_swapped) + gauge_scheduler_waiting.set(labels, stats.num_waiting) + gauge_gpu_cache_usage.set(labels, stats.gpu_cache_usage) + gauge_cpu_cache_usage.set(labels, stats.cpu_cache_usage) + + # Add to token counters. + counter_prompt_tokens.add(labels, stats.num_prompt_tokens) + counter_generation_tokens.add(labels, stats.num_generation_tokens) + + # Observe request level latencies in histograms. + for ttft in stats.time_to_first_tokens: + histogram_time_to_first_token.observe(labels, ttft) + for tpot in stats.time_per_output_tokens: + histogram_time_per_output_tokens.observe(labels, tpot) + for e2e in stats.time_e2e_requests: + histogram_e2e_request_latency.observe(labels, e2e) + + def _log_prometheus_interval(self, prompt_throughput: float, + generation_throughput: float) -> None: + # Logs metrics to prometheus that are computed every logging_interval. + # Support legacy gauge metrics that make throughput calculations on the vLLM side. + # Moving forward, we should use counters like counter_prompt_tokens, counter_generation_tokens + # Which log raw data and calculate summaries using rate() on the grafana/prometheus side. + # See https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 + gauge_avg_prompt_throughput.set(labels, prompt_throughput) + gauge_avg_generation_throughput.set(labels, generation_throughput) + + def log(self, stats: Stats) -> None: + """Called by LLMEngine. 
+ Logs to prometheus and tracked stats every iteration. + Logs to Stdout every self.local_interval seconds.""" + + # Log to prometheus. + self._log_prometheus(stats) + + # Save tracked stats for token counters. + self.num_prompt_tokens.append(stats.num_prompt_tokens) + self.num_generation_tokens.append(stats.num_generation_tokens) + + # Log locally every local_interval seconds. + if self._local_interval_elapsed(stats.now): + + # Compute summary metrics for tracked stats (and log them to promethus if applicable). + prompt_throughput = self._get_throughput(self.num_prompt_tokens, + now=stats.now) + generation_throughput = self._get_throughput( + self.num_generation_tokens, now=stats.now) + self._log_prometheus_interval( + prompt_throughput=prompt_throughput, + generation_throughput=generation_throughput) + # Log to stdout. + logger.info( + f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, " + f"Avg generation throughput: {generation_throughput:.1f} tokens/s, " + f"Running: {stats.num_running} reqs, " + f"Swapped: {stats.num_swapped} reqs, " + f"Pending: {stats.num_waiting} reqs, " + f"GPU KV cache usage: {stats.gpu_cache_usage * 100:.1f}%, " + f"CPU KV cache usage: {stats.cpu_cache_usage * 100:.1f}%") -def record_metrics( - avg_prompt_throughput: float, - avg_generation_throughput: float, - scheduler_running: int, - scheduler_swapped: int, - scheduler_waiting: int, - gpu_cache_usage: float, - cpu_cache_usage: float, -): - gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput) - gauge_avg_generation_throughput.set(labels, avg_generation_throughput) - gauge_scheduler_running.set(labels, scheduler_running) - gauge_scheduler_swapped.set(labels, scheduler_swapped) - gauge_scheduler_waiting.set(labels, scheduler_waiting) - gauge_gpu_cache_usage.set(labels, gpu_cache_usage) - gauge_cpu_cache_usage.set(labels, cpu_cache_usage) + # Reset tracked stats for next interval. + self.num_prompt_tokens = [] + self.num_generation_tokens = [] + self.last_local_log = stats.now diff --git a/vllm/sequence.py b/vllm/sequence.py index d28627f47498f..7b1c9a77a1e02 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -52,7 +52,6 @@ def get_finished_reason(status: "SequenceStatus") -> Union[str, None]: class SequenceData: """Data associated with a sequence. - Args: prompt_token_ids: The token IDs of the prompt. 
@@ -254,6 +253,7 @@ def __init__( self.seqs_dict = {seq.seq_id: seq for seq in seqs} self.sampling_params = sampling_params self.arrival_time = arrival_time + self.last_token_time = arrival_time self.lora_request = lora_request self.prefix: Optional[Prefix] = prefix self.prompt_logprobs: Optional[PromptLogprobs] = None @@ -274,6 +274,12 @@ def prompt_token_ids(self) -> List[int]: def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 + def get_last_latency(self, now: float) -> float: + """Gets last token latency for Request level timings.""" + latency = now - self.last_token_time + self.last_token_time = now + return latency + def get_max_num_running_seqs(self) -> int: """The maximum number of sequences running in parallel in the remaining lifetime of the request.""" From cd9e60c76c776c42431b7ae523fcfe7835546d74 Mon Sep 17 00:00:00 2001 From: Fengzhe Zhou Date: Fri, 2 Feb 2024 01:27:40 +0800 Subject: [PATCH 028/112] Add Internlm2 (#2666) --- README.md | 1 + docs/source/models/supported_models.rst | 3 + vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/internlm2.py | 325 ++++++++++++++++++++++++ 4 files changed, 330 insertions(+) create mode 100644 vllm/model_executor/models/internlm2.py diff --git a/README.md b/README.md index 3853760613833..281ba3ca88679 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi - GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.) - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.) - InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.) +- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.) - LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.) - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.) - Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 2663cf2366e64..a806aa4e29452 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -47,6 +47,9 @@ Alongside each architecture, we include some popular models that use it. * - :code:`InternLMForCausalLM` - InternLM - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. + * - :code:`InternLM2ForCausalLM` + - InternLM2 + - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. * - :code:`LlamaForCausalLM` - LLaMA, LLaMA-2, Vicuna, Alpaca, Koala, Guanaco - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`young-geng/koala`, etc. 
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index a26a513a60036..fb519b3c0cf92 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -25,6 +25,7 @@ "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"), "InternLMForCausalLM": ("internlm", "InternLMForCausalLM"), + "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), # For decapoda-research/llama-* "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py new file mode 100644 index 0000000000000..ebf1d8a89a022 --- /dev/null +++ b/vllm/model_executor/models/internlm2.py @@ -0,0 +1,325 @@ +# -*- coding: utf-8 -*- +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class InternLM2MLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.w2 = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.w2(x) + return x + + +class InternLM2Attention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. 
+ assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.wqkv = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + self.wo = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.wqkv(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.wo(attn_output) + return output + + +class InternLMDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.attention = InternLM2Attention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + ) + self.feed_forward = InternLM2MLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.attention_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.ffn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.attention_norm(hidden_states) + else: + hidden_states, residual = self.attention_norm( + hidden_states, residual) + hidden_states = self.attention( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.ffn_norm(hidden_states, residual) + hidden_states = self.feed_forward(hidden_states) + return hidden_states, residual + + +class InternLM2Model(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + 
super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.tok_embeddings = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + InternLMDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.tok_embeddings(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class InternLM2ForCausalLM(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = InternLM2Model(config, linear_method) + self.output = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.output.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "w1", 0), + ("gate_up_proj", "w3", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + if "wqkv" in name: + config = self.config + kv_groups = config.num_attention_heads // config.num_key_value_heads + head_dim = config.hidden_size // config.num_attention_heads + loaded_weight = loaded_weight.view(-1, 2 + kv_groups, + head_dim, + loaded_weight.shape[-1]) + wq, wk, wv = torch.split(loaded_weight, [kv_groups, 1, 1], + dim=1) + wq = wq.reshape(-1, wq.shape[-1]) + wk = wk.reshape(-1, wk.shape[-1]) + wv = wv.reshape(-1, wv.shape[-1]) + weight_loader = param.weight_loader + weight_loader(param, wq, 'q') + weight_loader(param, wk, 'k') + weight_loader(param, wv, 'v') + else: + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From 923797fea4d80a4dac4409ece3c450b84d5ba001 Mon Sep 17 00:00:00 2001 From: zhaoyang-star Date: Fri, 2 Feb 2024 01:35:09 +0800 Subject: [PATCH 029/112] Fix compile error when using rocm (#2648) --- csrc/attention/attention_kernels.cu | 2 ++ csrc/cache_kernels.cu | 7 +++++++ csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh | 1 - 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index a5ddeac740440..b5be3befa07e2 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -25,7 +25,9 @@ #include "attention_dtypes.h" #include "attention_utils.cuh" +#ifdef ENABLE_FP8_E5M2 #include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh" +#endif #include diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index fe0159e404585..ceb7347d94670 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -4,13 +4,20 @@ #include "cuda_compat.h" #include "dispatch_utils.h" +#ifdef ENABLE_FP8_E5M2 #include "quantization/fp8_e5m2_kvcache/quant_utils.cuh" +#endif #include #include #include #include +#ifdef USE_ROCM + #include + typedef __hip_bfloat16 __nv_bfloat16; +#endif + void swap_blocks( torch::Tensor& src, torch::Tensor& dst, diff --git a/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh b/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh index c3b0d311b89cc..9bcab25db03cf 100644 --- a/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh +++ b/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh @@ -9,7 +9,6 @@ #include "../../attention/dtype_float16.cuh" #include "../../attention/dtype_bfloat16.cuh" -#pragma once namespace vllm { #ifdef ENABLE_FP8_E5M2 From b9e96b17de4c555e2249db9f6149b346232c000e Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Thu, 1 Feb 2024 14:00:58 -0800 Subject: [PATCH 030/112] fix python 3.8 syntax (#2716) --- Dockerfile | 15 ++++++++++++++- vllm/entrypoints/openai/serving_completion.py | 14 +++++++------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index 364345d60e041..3db86adf19a91 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,8 +4,21 @@ #################### BASE BUILD IMAGE #################### FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev +# Set the DEBIAN_FRONTEND variable to noninteractive to avoid interactive prompts +ENV DEBIAN_FRONTEND=noninteractive + +# Preconfigure tzdata for US Central Time (build running in us-central-1 but this really doesn't matter.) 
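+# (debconf-set-selections pre-seeds the answers the tzdata package would otherwise prompt for during install.)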
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Chicago' | debconf-set-selections + +# We install an older version of python here for testing to make sure vllm works with older versions of Python. +# For the actual openai compatible server, we will use the latest version of Python. RUN apt-get update -y \ - && apt-get install -y python3-pip git + && apt-get install -y software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa -y \ + && apt-get update -y \ + && apt-get install -y python3.8 python3.8-dev python3.8-venv python3-pip git \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 8c9a7ad309cea..191142d222ea7 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -1,7 +1,7 @@ import asyncio import time from fastapi import Request -from typing import AsyncGenerator, AsyncIterator, Callable, List, Optional +from typing import AsyncGenerator, AsyncIterator, Callable, List, Optional, Dict, Tuple from vllm.logger import init_logger from vllm.utils import random_uuid from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -19,8 +19,8 @@ logger = init_logger(__name__) -TypeTokenIDs = list[int] -TypeTopLogProbs = List[Optional[dict[int, float]]] +TypeTokenIDs = List[int] +TypeTopLogProbs = List[Optional[Dict[int, float]]] TypeCreateLogProbsFn = Callable[ [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], LogProbs] @@ -29,7 +29,7 @@ async def completion_stream_generator( request: CompletionRequest, raw_request: Request, on_abort, - result_generator: AsyncIterator[tuple[int, RequestOutput]], + result_generator: AsyncIterator[Tuple[int, RequestOutput]], create_logprobs_fn: TypeCreateLogProbsFn, request_id: str, created_time: int, @@ -126,7 +126,7 @@ async def completion_stream_generator( yield "data: [DONE]\n\n" -def parse_prompt_format(prompt) -> tuple[bool, list]: +def parse_prompt_format(prompt) -> Tuple[bool, list]: # get the prompt, openai supports the following # "a string, array of strings, array of tokens, or array of token arrays." 
prompt_is_tokens = False @@ -151,7 +151,7 @@ def parse_prompt_format(prompt) -> tuple[bool, list]: def request_output_to_completion_response( - final_res_batch: list[RequestOutput], + final_res_batch: List[RequestOutput], request: CompletionRequest, create_logprobs_fn: TypeCreateLogProbsFn, request_id: str, @@ -302,7 +302,7 @@ async def create_completion(self, request: CompletionRequest, except ValueError as e: return self.create_error_response(str(e)) - result_generator: AsyncIterator[tuple[ + result_generator: AsyncIterator[Tuple[ int, RequestOutput]] = merge_async_iterators(*generators) # Similar to the OpenAI API, when n != best_of, we do not stream the From bb8c697ee0f01aaeddce31bd5fba3e9f7f4488a1 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Thu, 1 Feb 2024 14:56:53 -0800 Subject: [PATCH 031/112] Update README for meetup slides (#2718) --- README.md | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/README.md b/README.md index 281ba3ca88679..c48ddcfa0a79a 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,8 @@ Easy, fast, and cheap LLM serving for everyone --- -**The Second vLLM Bay Area Meetup (Jan 31st 5pm-7:30pm PT)** - -We are thrilled to announce our second vLLM Meetup! -The vLLM team will share recent updates and roadmap. -We will also have vLLM collaborators from IBM coming up to the stage to discuss their insights on LLM optimizations. -Please register [here](https://lu.ma/ygxbpzhl) and join us! - ---- - *Latest News* 🔥 +- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). - [2024/01] Added ROCm 6.0 support to vLLM. - [2023/12] Added ROCm 5.7 support to vLLM. - [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing). From c410f5d020216df2dfedde52bcae24ae7f0ac7ec Mon Sep 17 00:00:00 2001 From: Pernekhan Utemuratov Date: Thu, 1 Feb 2024 15:41:58 -0800 Subject: [PATCH 032/112] Use revision when downloading the quantization config file (#2697) Co-authored-by: Pernekhan Utemuratov --- vllm/model_executor/model_loader.py | 5 +---- vllm/model_executor/weight_utils.py | 29 ++++++++++++++--------------- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index bf13ebf57d422..cd21c7788fc7d 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -44,10 +44,7 @@ def get_model(model_config: ModelConfig, # Get the (maybe quantized) linear method. 
linear_method = None if model_config.quantization is not None: - quant_config = get_quant_config(model_config.quantization, - model_config.model, - model_config.hf_config, - model_config.download_dir) + quant_config = get_quant_config(model_config) capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] if capability < quant_config.get_min_capability(): diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 8e6f7a174f219..3570366887e78 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -11,9 +11,9 @@ import numpy as np from safetensors.torch import load_file, save_file, safe_open import torch -from transformers import PretrainedConfig from tqdm.auto import tqdm +from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.model_executor.layers.quantization import (get_quantization_config, QuantizationConfig) @@ -83,25 +83,22 @@ def convert_bin_to_safetensor_file( # TODO(woosuk): Move this to other place. -def get_quant_config( - quantization: str, - model_name_or_path: str, - hf_config: PretrainedConfig, - cache_dir: Optional[str] = None, -) -> QuantizationConfig: - quant_cls = get_quantization_config(quantization) +def get_quant_config(model_config: ModelConfig) -> QuantizationConfig: + quant_cls = get_quantization_config(model_config.quantization) # Read the quantization config from the HF model config, if available. - hf_quant_config = getattr(hf_config, "quantization_config", None) + hf_quant_config = getattr(model_config.hf_config, "quantization_config", + None) if hf_quant_config is not None: return quant_cls.from_config(hf_quant_config) - + model_name_or_path = model_config.model is_local = os.path.isdir(model_name_or_path) if not is_local: # Download the config files. 
- with get_lock(model_name_or_path, cache_dir): + with get_lock(model_name_or_path, model_config.download_dir): hf_folder = snapshot_download(model_name_or_path, + revision=model_config.revision, allow_patterns="*.json", - cache_dir=cache_dir, + cache_dir=model_config.download_dir, tqdm_class=Disabledtqdm) else: hf_folder = model_name_or_path @@ -112,10 +109,12 @@ def get_quant_config( f.endswith(x) for x in quant_cls.get_config_filenames()) ] if len(quant_config_files) == 0: - raise ValueError(f"Cannot find the config file for {quantization}") + raise ValueError( + f"Cannot find the config file for {model_config.quantization}") if len(quant_config_files) > 1: - raise ValueError(f"Found multiple config files for {quantization}: " - f"{quant_config_files}") + raise ValueError( + f"Found multiple config files for {model_config.quantization}: " + f"{quant_config_files}") quant_config_file = quant_config_files[0] with open(quant_config_file, "r") as f: From 96b6f475dda40a0c7d557f73c36fe09c07be2e9c Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 2 Feb 2024 07:46:39 +0800 Subject: [PATCH 033/112] Remove hardcoded `device="cuda" ` to support more devices (#2503) Co-authored-by: Jiang Li Co-authored-by: Kunshang Ji --- benchmarks/benchmark_latency.py | 7 ++ benchmarks/benchmark_throughput.py | 10 ++- .../kernels/benchmark_paged_attention.py | 27 ++++--- tests/kernels/test_activation.py | 37 +++++---- tests/kernels/test_attention.py | 51 ++++++------- tests/kernels/test_cache.py | 42 +++++------ tests/kernels/test_layernorm.py | 17 +++-- tests/kernels/test_pos_encoding.py | 22 +++--- tests/kernels/test_prefix_prefill.py | 57 +++++--------- tests/lora/conftest.py | 4 +- tests/lora/test_layers.py | 36 +++++---- tests/lora/test_worker.py | 4 +- tests/samplers/test_rejection_sampler.py | 64 +++++++--------- tests/samplers/test_sampler.py | 37 +++++---- tests/worker/spec_decode/utils.py | 3 +- tests/worker/test_model_runner.py | 2 +- vllm/config.py | 6 ++ vllm/engine/arg_utils.py | 25 +++++-- vllm/engine/llm_engine.py | 12 ++- vllm/model_executor/layers/activation.py | 4 +- vllm/model_executor/layers/attention.py | 2 +- vllm/model_executor/layers/linear.py | 10 +-- .../model_executor/layers/quantization/awq.py | 3 - .../layers/quantization/gptq.py | 4 - .../layers/quantization/squeezellm.py | 6 +- .../model_executor/layers/rotary_embedding.py | 22 +++--- .../layers/vocab_parallel_embedding.py | 2 - vllm/model_executor/model_loader.py | 5 +- vllm/utils.py | 3 +- vllm/worker/cache_engine.py | 2 + vllm/worker/model_runner.py | 75 ++++++++++++------- vllm/worker/worker.py | 40 +++++----- 32 files changed, 346 insertions(+), 295 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 7173134358762..2eb9e2cb8e4d5 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -25,6 +25,7 @@ def main(args: argparse.Namespace): dtype=args.dtype, enforce_eager=args.enforce_eager, kv_cache_dtype=args.kv_cache_dtype, + device=args.device, ) sampling_params = SamplingParams( @@ -135,5 +136,11 @@ def run_to_completion(profile_dir: Optional[str] = None): default=None, help=('path to save the pytorch profiler output. 
Can be visualized ' 'with ui.perfetto.dev or Tensorboard.')) + parser.add_argument( + "--device", + type=str, + default="cuda", + choices=["cuda"], + help='device type for vLLM execution, supporting CUDA only currently.') args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index d45d33307c912..1ad502526c97c 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -72,6 +72,7 @@ def run_vllm( max_model_len: Optional[int], enforce_eager: bool, kv_cache_dtype: str, + device: str, ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -85,6 +86,7 @@ def run_vllm( max_model_len=max_model_len, enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, + device=device, ) # Add the requests to the engine. @@ -209,7 +211,7 @@ def main(args: argparse.Namespace): args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, - args.kv_cache_dtype) + args.kv_cache_dtype, args.device) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -294,6 +296,12 @@ def main(args: argparse.Namespace): default="auto", help= 'Data type for kv cache storage. If "auto", will use model data type.') + parser.add_argument( + "--device", + type=str, + default="cuda", + choices=["cuda"], + help='device type for vLLM execution, supporting CUDA only currently.') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 56fe1b921d44e..d921dea1220e1 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -25,18 +25,20 @@ def main( dtype: torch.dtype, seed: int, do_profile: bool, + device: str = "cuda", kv_cache_dtype: Optional[str] = None, ) -> None: random.seed(seed) torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) scale = float(1.0 / (head_size**0.5)) query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype, - device="cuda") + device=device) query.uniform_(-scale, scale) assert num_query_heads % num_kv_heads == 0 @@ -44,11 +46,11 @@ def main( if use_alibi: alibi_slopes = torch.randn(num_query_heads, dtype=torch.float, - device="cuda") + device=device) context_lens = [context_len for _ in range(num_seqs)] max_context_len = max(context_lens) - context_lens = torch.tensor(context_lens, dtype=torch.int, device="cuda") + context_lens = torch.tensor(context_lens, dtype=torch.int, device=device) # Create the block tables. max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size @@ -59,12 +61,17 @@ def main( for _ in range(max_num_blocks_per_seq) ] block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int, device="cuda") + block_tables = torch.tensor(block_tables, dtype=torch.int, device=device) # Create the KV cache. - key_caches, value_caches = create_kv_caches_with_random( - NUM_BLOCKS, block_size, 1, num_kv_heads, head_size, kv_cache_dtype, - dtype) + key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS, + block_size, + 1, + num_kv_heads, + head_size, + kv_cache_dtype, + dtype, + device=device) key_cache, value_cache = key_caches[0], value_caches[0] # Prepare for the paged attention kernel. 
@@ -84,7 +91,7 @@ def main( ) max_logits = torch.empty_like(exp_sums) - def run_benchmark(num_iters: int, profile: bool = False) -> float: + def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: torch.cuda.synchronize() if profile: torch.cuda.cudart().cudaProfilerStart() @@ -135,6 +142,7 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float: # Warmup. print("Warming up...") + run_benchmark = run_cuda_benchmark run_benchmark(num_iters=3, profile=False) # Benchmark. @@ -175,6 +183,7 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float: default="auto", help= 'Data type for kv cache storage. If "auto", will use model data type.') + parser.add_argument("--device", type=str, choices=["cuda"], default="cuda") args = parser.parse_args() print(args) diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 826bf8350af17..de0b497057269 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -7,26 +7,29 @@ NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 4096, 5120, 13824] # Arbitrary values for testing SEEDS = [0] -DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_silu_and_mul( num_tokens: int, d: int, dtype: torch.dtype, seed: int, - device: int, + device: str, ) -> None: torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" - x = torch.randn(num_tokens, 2 * d, dtype=dtype, device=gpu_id) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + x = torch.randn(num_tokens, 2 * d, dtype=dtype) layer = SiluAndMul() out = layer(x) ref_out = layer._forward(x) @@ -37,19 +40,20 @@ def test_silu_and_mul( @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_gelu_new( num_tokens: int, d: int, dtype: torch.dtype, seed: int, - device: int, + device: str, ) -> None: torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" - x = torch.randn(num_tokens, d, dtype=dtype, device=gpu_id) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + x = torch.randn(num_tokens, d, dtype=dtype) layer = NewGELU() out = layer(x) ref_out = layer._forward(x) @@ -60,18 +64,19 @@ def test_gelu_new( @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_gelu_fast( num_tokens: int, d: int, dtype: torch.dtype, seed: int, - device: int, + device: str, ) -> None: torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" - x = torch.randn(num_tokens, d, dtype=dtype, device=gpu_id) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + x = torch.randn(num_tokens, d, dtype=dtype) layer = FastGELU() out = layer(x) ref_out = 
layer._forward(x) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index cbb1d40623c71..92d63eb6c63e2 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -27,7 +27,9 @@ USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8_e5m2"] SEEDS = [0] -DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] def ref_masked_attention( @@ -91,7 +93,7 @@ def ref_single_query_cached_kv_attention( alibi_bias = None if alibi_slopes is not None: # Create the ALiBi bias used in the paged attention kernel. - position_ids = torch.arange(context_len, device=query.device).int() + position_ids = torch.arange(context_len).int() alibi_bias = (position_ids - context_len + 1).float() alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view( 1, 1, -1) @@ -110,7 +112,7 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_paged_attention( kv_cache_factory, version: str, @@ -122,33 +124,28 @@ def test_paged_attention( dtype: torch.dtype, kv_cache_dtype: str, seed: int, - device: int, + device: str, ) -> None: random.seed(seed) torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads - query = torch.empty(num_seqs, - num_query_heads, - head_size, - dtype=dtype, - device=gpu_id) + query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype) query.uniform_(-scale, scale) assert num_query_heads % num_kv_heads == 0 num_queries_per_kv = num_query_heads // num_kv_heads alibi_slopes = None if use_alibi: - alibi_slopes = torch.randn(num_query_heads, - dtype=torch.float, - device=gpu_id) + alibi_slopes = torch.randn(num_query_heads, dtype=torch.float) context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)] context_lens[-1] = MAX_SEQ_LEN max_context_len = max(context_lens) - context_lens = torch.tensor(context_lens, dtype=torch.int, device=gpu_id) + context_lens = torch.tensor(context_lens, dtype=torch.int) # Create the block tables. max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size @@ -159,13 +156,13 @@ def test_paged_attention( for _ in range(max_num_blocks_per_seq) ] block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int, device=gpu_id) + block_tables = torch.tensor(block_tables, dtype=torch.int) # Create the KV caches. key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, num_kv_heads, head_size, kv_cache_dtype, dtype, seed, - gpu_id) + device) key_cache, value_cache = key_caches[0], value_caches[0] # Call the paged attention kernel. 
@@ -193,12 +190,10 @@ def test_paged_attention( tmp_output = torch.empty( size=(num_seqs, num_heads, num_partitions, head_size), dtype=output.dtype, - device=output.device, ) exp_sums = torch.empty( size=(num_seqs, num_heads, num_partitions), dtype=torch.float32, - device=output.device, ) max_logits = torch.empty_like(exp_sums) ops.paged_attention_v2( @@ -229,14 +224,14 @@ def test_paged_attention( block_size, x) dequantized_key_cache = torch.empty(size=key_cache_shape, dtype=dtype, - device=gpu_id) + device=device) cache_ops.convert_fp8_e5m2(key_cache, dequantized_key_cache) key_cache = dequantized_key_cache value_cache_shape = value_cache.shape dequantized_value_cache = torch.empty(size=value_cache_shape, dtype=dtype, - device=gpu_id) + device=device) cache_ops.convert_fp8_e5m2(value_cache, dequantized_value_cache) value_cache = dequantized_value_cache @@ -283,7 +278,7 @@ def ref_multi_query_kv_attention( attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype), diagonal=1) attn_mask = attn_mask * torch.finfo(dtype).min - attn_mask = attn_mask.to(dtype=dtype, device=query.device) + attn_mask = attn_mask.to(dtype=dtype) ref_output = ref_masked_attention( query[start_idx:end_idx], @@ -303,7 +298,7 @@ def ref_multi_query_kv_attention( @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_multi_query_kv_attention( num_seqs: int, @@ -311,12 +306,13 @@ def test_multi_query_kv_attention( head_size: int, dtype: torch.dtype, seed: int, - device: int, + device: str, ) -> None: random.seed(seed) torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) # MAX_SEQ_LEN sometimes causes OOM in the reference implementation. # As the xformers library is already tested with its own tests, we can use # a smaller MAX_SEQ_LEN here. 
@@ -329,8 +325,7 @@ def test_multi_query_kv_attention( qkv = torch.empty(num_tokens, num_query_heads + 2 * num_kv_heads, head_size, - dtype=dtype, - device=gpu_id) + dtype=dtype) qkv.uniform_(-scale, scale) query, key, value = qkv.split( [num_query_heads, num_kv_heads, num_kv_heads], dim=1) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 275ef8194d0bd..a90492f53eee6 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -17,7 +17,9 @@ NUM_BLOCKS = [1024, 3600] # Arbitrary values for testing NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] -DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] KV_CACHE_DTYPE = ["auto", "fp8_e5m2"] @@ -29,7 +31,7 @@ @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_copy_blocks( @@ -42,13 +44,14 @@ def test_copy_blocks( num_blocks: int, dtype: torch.dtype, seed: int, - device: int, kv_cache_dtype: str, + device: str, ) -> None: random.seed(seed) torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) # Generate random block mappings where each source block is mapped to two # destination blocks. assert 2 * num_mappings <= num_blocks @@ -66,7 +69,7 @@ def test_copy_blocks( key_caches, value_caches = kv_cache_factory(num_blocks, block_size, num_layers, num_heads, head_size, kv_cache_dtype, - dtype, seed, gpu_id) + dtype, seed, device) # Clone the KV caches. cloned_key_caches = [key_cache.clone() for key_cache in key_caches] @@ -98,7 +101,7 @@ def test_copy_blocks( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_reshape_and_cache( kv_cache_factory, @@ -109,29 +112,25 @@ def test_reshape_and_cache( num_blocks: int, dtype: torch.dtype, seed: int, - device: int, + device: str, ) -> None: random.seed(seed) torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) # Create a random slot mapping. num_slots = block_size * num_blocks slot_mapping = random.sample(range(num_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=gpu_id) - - qkv = torch.randn(num_tokens, - 3, - num_heads, - head_size, - dtype=dtype, - device=gpu_id) + slot_mapping = torch.tensor(slot_mapping, dtype=torch.long) + + qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype) _, key, value = qkv.unbind(dim=1) # Create the KV caches. key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, num_heads, head_size, dtype, - None, seed, gpu_id) + None, seed, device) key_cache, value_cache = key_caches[0], value_caches[0] # Clone the KV caches. 
@@ -166,7 +165,7 @@ def test_reshape_and_cache( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_swap_blocks( kv_cache_factory, @@ -182,7 +181,8 @@ def test_swap_blocks( ) -> None: random.seed(seed) torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) src_device = f"{direction[0]}:{device}" if direction[ 0] == "cuda" else direction[0] dst_device = f"{direction[1]}:{device}" if direction[ diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 8a06b3aa268be..b1e3c1a7f07f5 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -8,7 +8,9 @@ HIDDEN_SIZES = [768, 5120, 8192] # Arbitrary values for testing ADD_RESIDUAL = [False, True] SEEDS = [0] -DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -16,7 +18,7 @@ @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_rms_norm( num_tokens: int, @@ -24,15 +26,16 @@ def test_rms_norm( add_residual: bool, dtype: torch.dtype, seed: int, - device: int, + device: str, ) -> None: torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" - layer = RMSNorm(hidden_size).to(dtype=dtype, device=gpu_id) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + layer = RMSNorm(hidden_size).to(dtype=dtype) layer.weight.data.normal_(mean=1.0, std=0.1) scale = 1 / (2 * hidden_size) - x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=gpu_id) + x = torch.randn(num_tokens, hidden_size, dtype=dtype) x *= scale residual = torch.randn_like(x) * scale if add_residual else None diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index aad310e2bc6d2..19cbd600e838f 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -13,7 +13,9 @@ BATCH_SIZES = [1, 5] # Arbitrary values for testing SEQ_LENS = [11, 8192] # Arbitrary values for testing SEEDS = [0] -DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -24,7 +26,7 @@ @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_rotary_embedding( is_neox_style: bool, @@ -35,28 +37,26 @@ def test_rotary_embedding( rotary_dim: Optional[int], dtype: torch.dtype, seed: int, - device: int, + device: str, max_position: int = 8192, base: int = 10000, ) -> None: if rotary_dim is None: rotary_dim = head_size torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) if rotary_dim 
is None: rotary_dim = head_size rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style) - rope = rope.to(dtype=dtype, device=gpu_id) + rope = rope.to(dtype=dtype) - positions = torch.randint(0, - max_position, (batch_size, seq_len), - device=gpu_id) + positions = torch.randint(0, max_position, (batch_size, seq_len)) query = torch.randn(batch_size, seq_len, num_heads * head_size, - dtype=dtype, - device=gpu_id) + dtype=dtype) key = torch.randn_like(query) # NOTE(woosuk): The reference implementation should be executed first diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 0531b05135fb9..ac93b32588cca 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -11,19 +11,27 @@ NUM_HEADS = [12] HEAD_SIZES = [128] DTYPES = [torch.float16] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_contexted_kv_attention( num_heads: int, head_size: int, dtype: torch.dtype, + device: str, ) -> None: random.seed(0) torch.manual_seed(0) + if torch.cuda.is_available(): + torch.cuda.manual_seed(0) + torch.set_default_device(device) MAX_SEQ_LEN = 1024 MAX_CTX_LEN = 1024 BS = 10 @@ -35,24 +43,11 @@ def test_contexted_kv_attention( seq_lens = [a + b for a, b in zip(subquery_lens, ctx_lens)] num_tokens = sum(subquery_lens) - query = torch.empty(num_tokens, - num_heads, - head_size, - dtype=dtype, - device='cuda') + query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) query.uniform_(-1e-3, 1e-3) - output = torch.empty(num_tokens, - num_heads, - head_size, - dtype=dtype, - device='cuda') + output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) - kv = torch.empty(sum(seq_lens), - 2, - num_heads, - head_size, - dtype=dtype, - device='cuda') + kv = torch.empty(sum(seq_lens), 2, num_heads, head_size, dtype=dtype) kv.uniform_(-1e-3, 1e-3) key, value = kv.unbind(dim=1) @@ -60,39 +55,27 @@ def test_contexted_kv_attention( block_size, num_heads, head_size, - dtype=dtype, - device='cuda') + dtype=dtype) v_cache = torch.zeros(cache_size, block_size, num_heads, head_size, - dtype=dtype, - device='cuda') - k = torch.zeros(sum(subquery_lens), - num_heads, - head_size, - dtype=dtype, - device='cuda') - v = torch.zeros(sum(subquery_lens), - num_heads, - head_size, - dtype=dtype, - device='cuda') - values = torch.arange(0, cache_size, dtype=torch.long, device='cuda') + dtype=dtype) + k = torch.zeros(sum(subquery_lens), num_heads, head_size, dtype=dtype) + v = torch.zeros(sum(subquery_lens), num_heads, head_size, dtype=dtype) + values = torch.arange(0, cache_size, dtype=torch.long) values = values[torch.randperm(cache_size)] block_table = values[:BS * max_block_per_request].view( BS, max_block_per_request) - b_seq_len = torch.tensor(seq_lens, dtype=torch.long, device='cuda') - b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long, device='cuda') + b_seq_len = torch.tensor(seq_lens, dtype=torch.long) + b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long) b_start_loc = torch.cumsum(torch.tensor([0] + subquery_lens[:-1], - dtype=torch.long, - device='cuda'), + dtype=torch.long), dim=0) max_input_len = MAX_SEQ_LEN # copy kv to cache b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1], - dtype=torch.long, - device='cuda'), + 
dtype=torch.long), dim=0) for i in range(BS): for j in range(subquery_lens[i]): diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index c1b3d04c713b5..163c3c70261c0 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -126,8 +126,8 @@ def llama_2_7b_engine_extra_embeddings() -> nn.Module: cleanup() get_model_old = get_model - def get_model_patched(model_config, lora_config=None): - return get_model_old(model_config, + def get_model_patched(model_config, device_config, lora_config=None): + return get_model_old(model_config, device_config, LoRAConfig(max_loras=4, max_lora_rank=8)) with patch("vllm.worker.model_runner.get_model", get_model_patched): diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 71c671132205a..f739bbeaab334 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -34,6 +34,9 @@ torch.float32: (5e-3, 5e-3), torch.bfloat16: (3e-2, 2e-2), } +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] def get_random_id_to_index(num_loras: int, @@ -151,14 +154,10 @@ def create_random_inputs( for _ in range(num_inputs): if input_type == torch.int: inputs.append( - torch.randint(low=int(low), - high=int(high), - size=input_size, - device="cuda")) + torch.randint(low=int(low), high=int(high), size=input_size)) else: inputs.append( - torch.rand(size=input_size, dtype=input_type, device="cuda") * - high + low) + torch.rand(size=input_size, dtype=input_type) * high + low) lora_id = random.choice(active_lora_ids) index_mapping += [lora_id] * input_size[0] @@ -169,8 +168,10 @@ def create_random_inputs( @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) -def test_embeddings(dist_init, num_loras) -> None: +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_embeddings(dist_init, num_loras, device) -> None: + torch.set_default_device(device) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, @@ -259,8 +260,10 @@ def create_random_embedding_layer(): @torch.inference_mode() # @pytest.mark.skip(reason="Fails when loras are in any slot other than the first.") @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) -def test_embeddings_with_new_embeddings(dist_init, num_loras) -> None: +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None: + torch.set_default_device(device) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, @@ -305,8 +308,7 @@ def create_random_embedding_layer(): # Add empty embeddings_tensors for unoccupied lora slots. 
for _ in range(max_loras - len(embeddings_tensors)): - embeddings_tensors.append( - torch.zeros(embeddings_tensors[0].shape, device="cuda")) + embeddings_tensors.append(torch.zeros(embeddings_tensors[0].shape)) inputs, index_mapping, prompt_mapping = create_random_inputs( active_lora_ids=list(lora_dict.keys()), @@ -388,8 +390,10 @@ def create_random_embedding_layer(): @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) -def test_lm_head_sampler(dist_init, num_loras) -> None: +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_lm_head_sampler(dist_init, num_loras, device) -> None: + torch.set_default_device(device) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, @@ -432,7 +436,7 @@ def create_random_sampler_layer(): ) lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - input_ = torch.rand(20, 1024, device="cuda") + input_ = torch.rand(20, 1024) mapping_info = convert_mapping( lora_mapping, id_to_index, @@ -500,8 +504,10 @@ def create_random_sampler_layer(): @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("orientation", ["row", "column"]) -def test_linear_parallel(dist_init, num_loras, orientation) -> None: +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_linear_parallel(dist_init, num_loras, orientation, device) -> None: + torch.set_default_device(device) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, @@ -597,8 +603,10 @@ def create_random_linear_parallel_layer(): @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("repeats", [2, 3]) -def test_column_parallel_packed(dist_init, num_loras, repeats) -> None: +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None: + torch.set_default_device(device) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 68c2c0b5fc134..31a7c716afbf2 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -5,7 +5,8 @@ from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest -from vllm.config import ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig +from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig, + DeviceConfig, LoRAConfig) from vllm.worker.worker import Worker @@ -25,6 +26,7 @@ def test_worker_apply_lora(sql_lora_files): ), parallel_config=ParallelConfig(1, 1, False), scheduler_config=SchedulerConfig(32, 32, 32, 256), + device_config=DeviceConfig("cuda"), local_rank=0, rank=0, lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32, diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 9d3ef3c67d3dc..99ee78ce49824 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -9,6 +9,10 @@ from vllm.model_executor.layers.rejection_sampler import RejectionSampler +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] + def mock_causal_accepted_tensor( k: int, last_accepted_indices: torch.Tensor) -> torch.Tensor: @@ -39,11 +43,14 @@ def mock_causal_accepted_tensor( @pytest.mark.parametrize( "which_tokens_accepted", ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"]) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() -def 
test_correct_output_format(which_tokens_accepted: str, seed: int): +def test_correct_output_format(which_tokens_accepted: str, seed: int, + device: str): """Verify the output has correct format given predetermined accepted matrix. """ set_random_seed(seed) + torch.set_default_device(device) batch_size = 10 k = 5 @@ -66,18 +73,15 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int): recovered_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, k), - dtype=torch.int64, - device="cuda") + dtype=torch.int64) draft_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, k), - dtype=torch.int64, - device="cuda") + dtype=torch.int64) bonus_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, 1), - dtype=torch.int64, - device="cuda") + dtype=torch.int64) rejection_sampler = RejectionSampler() rejection_sampler.init_gpu_tensors(rank=0) @@ -120,31 +124,24 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int): @pytest.mark.parametrize("k", list(range(1, 6))) @pytest.mark.parametrize("vocab_size", [30_000, 50_000]) @pytest.mark.parametrize("batch_size", list(range(1, 32))) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() -def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int): +def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, + device: str): + torch.set_default_device(device) rejection_sampler = RejectionSampler() rejection_sampler.init_gpu_tensors(rank=0) - draft_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device="cuda") - target_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device="cuda") + draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) bonus_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, 1), - dtype=torch.int64, - device="cuda") + dtype=torch.int64) draft_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, k), - dtype=torch.int64, - device="cuda") + dtype=torch.int64) rejection_sampler(target_probs, bonus_token_ids, draft_probs, draft_token_ids) @@ -153,36 +150,28 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int): @pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"]) @pytest.mark.parametrize("which_token_ids", ["bonus_token_ids", "draft_token_ids"]) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_raises_when_vocab_oob(above_or_below_vocab_range: str, - which_token_ids: str): + which_token_ids: str, device: str): k = 3 batch_size = 5 vocab_size = 30_000 + torch.set_default_device(device) rejection_sampler = RejectionSampler(strict_mode=True) rejection_sampler.init_gpu_tensors(rank=0) - draft_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device="cuda") - target_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device="cuda") + draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) bonus_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, 1), - dtype=torch.int64, - device="cuda") + dtype=torch.int64) draft_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, k), - dtype=torch.int64, - device="cuda") + dtype=torch.int64) oob_token_ids = None if which_token_ids == "bonus_token_ids": @@ 
-237,6 +226,7 @@ def test_rejection_sampling_approximates_target_distribution( probabilities are exactly equal. Rejection sampling should still work without any NaNs or exceptions. """ + torch.set_default_device("cpu") set_random_seed(seed) helper = _CorrectnessTestHelper( diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 962183a29fbfa..d34f32d03fee0 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -31,24 +31,26 @@ def _prepare_test( batch_size: int ) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, ModelRunner]: vocab_size = 32000 - input_tensor = torch.rand((batch_size, 1024), - device="cuda", - dtype=torch.float16) + input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) fake_logits = torch.full((batch_size, vocab_size), 1e-2, - device=input_tensor.device, dtype=input_tensor.dtype) sampler = MockLogitsSampler(32000, fake_logits) - model_runner = ModelRunner(None, None, None, None) + model_runner = ModelRunner(None, None, None, None, None) return input_tensor, fake_logits, sampler, model_runner RANDOM_SEEDS = list(range(128)) +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] @pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_sampler_all_greedy(seed: int): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_greedy(seed: int, device: str): set_random_seed(seed) + torch.set_default_device(device) batch_size = random.randint(1, 256) input_tensor, fake_logits, sampler, model_runner = _prepare_test( batch_size) @@ -81,8 +83,10 @@ def test_sampler_all_greedy(seed: int): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_sampler_all_random(seed: int): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_random(seed: int, device: str): set_random_seed(seed) + torch.set_default_device(device) batch_size = random.randint(1, 256) input_tensor, fake_logits, sampler, model_runner = _prepare_test( batch_size) @@ -120,8 +124,10 @@ def test_sampler_all_random(seed: int): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_sampler_all_beam(seed: int): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_beam(seed: int, device: str): set_random_seed(seed) + torch.set_default_device(device) batch_size = random.randint(1, 256) input_tensor, _, sampler, model_runner = _prepare_test(batch_size) @@ -156,8 +162,10 @@ def test_sampler_all_beam(seed: int): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_sampler_mixed(seed: int): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_mixed(seed: int, device: str): set_random_seed(seed) + torch.set_default_device(device) batch_size = random.randint(1, 256) input_tensor, fake_logits, sampler, model_runner = _prepare_test( batch_size) @@ -212,8 +220,10 @@ def test_sampler_mixed(seed: int): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_sampler_logits_processors(seed: int): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_logits_processors(seed: int, device: str): set_random_seed(seed) + torch.set_default_device(device) batch_size = random.randint(1, 256) input_tensor, _, sampler, model_runner = _prepare_test(batch_size) @@ -252,14 +262,15 @@ def pick_ith(token_ids, logits): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_sampler_top_k_top_p(seed: int): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_top_k_top_p(seed: int, device: str): set_random_seed(seed) batch_size = random.randint(1, 
256) top_k = random.randint(100, 500) top_p = random.random() * 0.1 vocab_size = 32000 input_tensor = torch.rand((batch_size, 1024), - device="cuda", + device=device, dtype=torch.float16) fake_logits = torch.normal(0, 5, @@ -267,7 +278,7 @@ def test_sampler_top_k_top_p(seed: int): device=input_tensor.device, dtype=input_tensor.dtype) sampler = MockLogitsSampler(32000, fake_logits) - model_runner = ModelRunner(None, None, None, None) + model_runner = ModelRunner(None, None, None, None, None) generation_model = GenerationMixin() generation_config = GenerationConfig(top_k=top_k, diff --git a/tests/worker/spec_decode/utils.py b/tests/worker/spec_decode/utils.py index e0db770046ec8..8d74509fea488 100644 --- a/tests/worker/spec_decode/utils.py +++ b/tests/worker/spec_decode/utils.py @@ -84,7 +84,7 @@ def create_worker(cls: type, ) (model_config, cache_config, parallel_config, scheduler_config, - _) = engine_args.create_engine_configs() + device_config, _) = engine_args.create_engine_configs() distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) @@ -93,6 +93,7 @@ def create_worker(cls: type, model_config=model_config, parallel_config=parallel_config, scheduler_config=scheduler_config, + device_config=device_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 5d9ad0520de13..f44895a728c7e 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -6,7 +6,7 @@ def test_prepare_prompt(): - model_runner = ModelRunner(None, None, None, None) + model_runner = ModelRunner(None, None, None, None, None) model_runner.set_block_size(16) batch_size = random.randint(1, 256) diff --git a/vllm/config.py b/vllm/config.py index 4fb7357a3da21..1dfc0d63c8813 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -444,6 +444,12 @@ def _verify_args(self) -> None: f"({self.max_num_seqs}).") +class DeviceConfig: + + def __init__(self, device: str = "cuda") -> None: + self.device = torch.device(device) + + @dataclass class LoRAConfig: max_lora_rank: int diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 231ce3321cdc4..d5e63e25d6e85 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -3,8 +3,8 @@ from dataclasses import dataclass from typing import Optional, Tuple -from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, - SchedulerConfig, LoRAConfig) +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) @dataclass @@ -43,6 +43,7 @@ class EngineArgs: lora_extra_vocab_size: int = 256 lora_dtype = 'auto' max_cpu_loras: Optional[int] = None + device: str = 'cuda' def __post_init__(self): if self.tokenizer is None: @@ -127,13 +128,13 @@ def add_cli_args( '--kv-cache-dtype', type=str, choices=['auto', 'fp8_e5m2'], - default='auto', + default=EngineArgs.kv_cache_dtype, help='Data type for kv cache storage. If "auto", will use model ' 'data type. Note FP8 is not supported when cuda version is ' 'lower than 11.8.') parser.add_argument('--max-model-len', type=int, - default=None, + default=EngineArgs.max_model_len, help='model context length. 
If unspecified, ' 'will be automatically derived from the model.') # Parallel arguments @@ -154,6 +155,7 @@ def add_cli_args( parser.add_argument( '--max-parallel-loading-workers', type=int, + default=EngineArgs.max_parallel_loading_workers, help='load model sequentially in multiple batches, ' 'to avoid RAM OOM when using tensor ' 'parallel and large models') @@ -200,7 +202,7 @@ def add_cli_args( '-q', type=str, choices=['awq', 'gptq', 'squeezellm', None], - default=None, + default=EngineArgs.quantization, help='Method used to quantize the weights. If ' 'None, we first check the `quantization_config` ' 'attribute in the model config file. If that is ' @@ -255,6 +257,13 @@ def add_cli_args( help=('Maximum number of LoRAs to store in CPU memory. ' 'Must be >= than max_num_seqs. ' 'Defaults to max_num_seqs.')) + parser.add_argument( + "--device", + type=str, + default=EngineArgs.device, + choices=["cuda"], + help=('Device type for vLLM execution. ' + 'Currently, only CUDA-compatible devices are supported.')) return parser @classmethod @@ -268,7 +277,8 @@ def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs': def create_engine_configs( self, ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig, - Optional[LoRAConfig]]: + DeviceConfig, Optional[LoRAConfig]]: + device_config = DeviceConfig(self.device) model_config = ModelConfig(self.model, self.tokenizer, self.tokenizer_mode, self.trust_remote_code, self.download_dir, self.load_format, @@ -296,7 +306,8 @@ def create_engine_configs( lora_dtype=self.lora_dtype, max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 else None) if self.enable_lora else None - return model_config, cache_config, parallel_config, scheduler_config, lora_config + return (model_config, cache_config, parallel_config, scheduler_config, + device_config, lora_config) @dataclass diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e60efc5e54e16..92568450a0d59 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -6,8 +6,8 @@ Union) from vllm.lora.request import LoRARequest -from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, - SchedulerConfig, LoRAConfig) +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics import StatLogger, Stats @@ -53,6 +53,7 @@ class LLMEngine: management. parallel_config: The configuration related to distributed execution. scheduler_config: The configuration related to the request scheduler. + device_config: The configuration related to the device. placement_group: Ray placement group for distributed execution. Required for distributed execution. log_stats: Whether to log statistics. @@ -64,6 +65,7 @@ def __init__( cache_config: CacheConfig, parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, + device_config: DeviceConfig, lora_config: Optional[LoRAConfig], placement_group: Optional["PlacementGroup"], log_stats: bool, @@ -85,6 +87,7 @@ def __init__( f"quantization={model_config.quantization}, " f"enforce_eager={model_config.enforce_eager}, " f"kv_cache_dtype={cache_config.cache_dtype}, " + f"device_config={device_config.device}, " f"seed={model_config.seed})") # TODO(woosuk): Print more configs in debug mode. 
@@ -93,6 +96,7 @@ def __init__( self.lora_config = lora_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config + self.device_config = device_config self.log_stats = log_stats self._verify_args() @@ -138,6 +142,7 @@ def _init_workers(self): self.model_config, self.parallel_config, self.scheduler_config, + self.device_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, @@ -233,6 +238,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", model_config = copy.deepcopy(self.model_config) parallel_config = copy.deepcopy(self.parallel_config) scheduler_config = copy.deepcopy(self.scheduler_config) + device_config = copy.deepcopy(self.device_config) for rank, (worker, (node_id, _)) in enumerate(zip(self.workers, @@ -244,6 +250,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", model_config, parallel_config, scheduler_config, + device_config, local_rank, rank, distributed_init_method, @@ -257,6 +264,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", model_config, parallel_config, scheduler_config, + device_config, driver_local_rank, driver_rank, distributed_init_method, diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 1af120d13cd4b..95902ae38e256 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -89,9 +89,7 @@ def __init__( if params_dtype is None: params_dtype = torch.get_default_dtype() self.scales = nn.Parameter( - torch.empty(intermediate_size_per_partition, - dtype=params_dtype, - device="cuda")) + torch.empty(intermediate_size_per_partition, dtype=params_dtype)) set_weight_attrs(self.scales, {"weight_loader": self.weight_loader}) def forward(self, x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 91ed43f07c76e..2ce9d60f08d80 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -200,7 +200,7 @@ def _make_alibi_bias( seq_len: int, dtype: torch.dtype, ) -> LowerTriangularMaskWithTensorBias: - bias = torch.arange(seq_len, dtype=dtype, device="cuda") + bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses # `bias = bias[None, :].repeat(prompt_len, 1)` # here. 
We find that both biases give the same results, but diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 5e1d63a6a62eb..55d38b763b2b5 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -54,7 +54,6 @@ def create_weights(self, input_size_per_partition: int, params_dtype: torch.dtype) -> Dict[str, Any]: weight = Parameter(torch.empty(output_size_per_partition, input_size_per_partition, - device=torch.cuda.current_device(), dtype=params_dtype), requires_grad=False) set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) @@ -113,9 +112,7 @@ def __init__( self.register_parameter(name, weight) if bias: self.bias = Parameter( - torch.empty(self.output_size, - device=torch.cuda.current_device(), - dtype=self.params_dtype)) + torch.empty(self.output_size, dtype=self.params_dtype)) set_weight_attrs(self.bias, {"output_dim": 0}) else: self.register_parameter("bias", None) @@ -183,7 +180,6 @@ def __init__( if bias: self.bias = Parameter( torch.empty(self.output_size_per_partition, - device=torch.cuda.current_device(), dtype=params_dtype)) set_weight_attrs(self.bias, { "output_dim": 0, @@ -509,9 +505,7 @@ def __init__( if bias: self.bias = Parameter( - torch.empty(self.output_size, - device=torch.cuda.current_device(), - dtype=params_dtype)) + torch.empty(self.output_size, dtype=params_dtype)) set_weight_attrs(self.bias, { "output_dim": 0, "weight_loader": self.weight_loader, diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 4d3fd3ec0cc71..681f95821eabb 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -96,7 +96,6 @@ def create_weights(self, input_size_per_partition: int, torch.empty( input_size_per_partition, output_size_per_partition // self.quant_config.pack_factor, - device="cuda", dtype=torch.int32, ), requires_grad=False, @@ -112,7 +111,6 @@ def create_weights(self, input_size_per_partition: int, torch.empty( input_size_per_partition // self.quant_config.group_size, output_size_per_partition // self.quant_config.pack_factor, - device="cuda", dtype=torch.int32, ), requires_grad=False, @@ -128,7 +126,6 @@ def create_weights(self, input_size_per_partition: int, torch.empty( input_size_per_partition // self.quant_config.group_size, output_size_per_partition, - device="cuda", dtype=params_dtype, ), requires_grad=False, diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 8fe96e7ddb98d..7218760fbe55d 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -127,7 +127,6 @@ def create_weights( torch.empty( input_size_per_partition // self.quant_config.pack_factor, output_size_per_partition, - device="cuda", dtype=torch.int32, ), requires_grad=False, @@ -145,7 +144,6 @@ def create_weights( i // self.quant_config.group_size for i in range(input_size_per_partition) ], - device="cuda", dtype=torch.int32, ), requires_grad=False, @@ -156,7 +154,6 @@ def create_weights( torch.empty( scale_and_zero_size, output_size_per_partition // self.quant_config.pack_factor, - device="cuda", dtype=torch.int32, ), requires_grad=False, @@ -172,7 +169,6 @@ def create_weights( torch.empty( scale_and_zero_size, output_size_per_partition, - device="cuda", dtype=params_dtype, ), requires_grad=False, diff --git a/vllm/model_executor/layers/quantization/squeezellm.py 
b/vllm/model_executor/layers/quantization/squeezellm.py index 1932bd145076b..9244e88552756 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -80,7 +80,6 @@ def create_weights(self, input_size_per_partition: int, torch.empty( input_size_per_partition // self.quant_config.pack_factor, output_size_per_partition, - device="cuda", dtype=torch.int32, ), requires_grad=False, @@ -96,7 +95,6 @@ def create_weights(self, input_size_per_partition: int, torch.empty( output_size, self.quant_config.weight_bits**2, - device="cuda", dtype=params_dtype, ), requires_grad=False, @@ -118,12 +116,12 @@ def apply_weights(self, out_shape = x.shape[:-1] + (qweight.shape[-1], ) reshaped_x = x.reshape(-1, x.shape[-1]) if is_hip(): - out_f = torch.zeros(out_shape, device="cuda", dtype=torch.float) + out_f = torch.zeros(out_shape, dtype=torch.float) ops.squeezellm_gemm(reshaped_x, qweight, out_f, lookup_table) out = out_f.to(dtype=torch.float16) else: # NOTE: The output tensor should be zero-initialized. - out = torch.zeros(out_shape, device="cuda", dtype=torch.float16) + out = torch.zeros(out_shape, dtype=torch.float16) ops.squeezellm_gemm(reshaped_x, qweight, out, lookup_table) if bias is not None: diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 91c093e33e3c9..93ec5c12536fb 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -77,16 +77,13 @@ def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: # create the cache on GPU for faster initialization. This may cause # a slight numerical difference between the HF implementation and ours. inv_freq = 1.0 / (base**(torch.arange( - 0, self.rotary_dim, 2, dtype=torch.float, device="cuda") / - self.rotary_dim)) + 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim)) return inv_freq def _compute_cos_sin_cache(self) -> torch.Tensor: """Compute the cos and sin cache.""" inv_freq = self._compute_inv_freq(self.base) - t = torch.arange(self.max_position_embeddings, - dtype=torch.float, - device="cuda") + t = torch.arange(self.max_position_embeddings, dtype=torch.float) freqs = torch.einsum("i,j -> ij", t, inv_freq) cos = freqs.cos() @@ -174,7 +171,7 @@ def _compute_cos_sin_cache(self) -> torch.Tensor: # Thus, the maximum length after applying the rope scaling is # self.max_position_embeddings * self.scaling_factor. 
max_len = self.max_position_embeddings * self.scaling_factor - t = torch.arange(max_len, dtype=torch.float, device="cuda") + t = torch.arange(max_len, dtype=torch.float) t = t / self.scaling_factor freqs = torch.einsum("i,j -> ij", t, inv_freq) @@ -214,7 +211,7 @@ def _compute_cos_sin_cache(self) -> torch.Tensor: (self.scaling_factor - 1))**(self.rotary_dim / (self.rotary_dim - 2)) inv_freq = self._compute_inv_freq(base) - t = torch.arange(max_len, dtype=torch.float, device="cuda") + t = torch.arange(max_len, dtype=torch.float) freqs = torch.einsum("i,j -> ij", t, inv_freq) cos = freqs.cos() @@ -297,9 +294,9 @@ def __init__( is_neox_style) def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: - pos_freqs = self.base**(torch.arange( - 0, self.rotary_dim, 2, dtype=torch.float, device="cuda") / - self.rotary_dim) + pos_freqs = self.base**( + torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / + self.rotary_dim) inv_freq_extrapolation = 1.0 / pos_freqs inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs) @@ -308,8 +305,8 @@ def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: self.max_position_embeddings) # Get n-d rotational scaling corrected for extrapolation inv_freq_mask = (1 - _yarn_linear_ramp_mask( - low, high, self.rotary_dim // 2, dtype=torch.float, - device="cuda")) * self.extrapolation_factor + low, high, self.rotary_dim // 2, + dtype=torch.float)) * self.extrapolation_factor inv_freq = inv_freq_interpolation * ( 1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask return inv_freq @@ -317,7 +314,6 @@ def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: def _compute_cos_sin_cache(self) -> torch.Tensor: inv_freq = self._compute_inv_freq(self.scaling_factor) t = torch.arange(self.max_position_embeddings * self.scaling_factor, - device="cuda", dtype=torch.float32) freqs = torch.einsum("i,j -> ij", t, inv_freq) cos = (freqs.cos() * self.mscale) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 9c5fb890251ed..6d13cf818cbfe 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -77,7 +77,6 @@ def __init__(self, self.weight = Parameter( torch.empty(self.num_embeddings_per_partition, self.embedding_dim, - device=torch.cuda.current_device(), dtype=params_dtype)) set_weight_attrs(self.weight, { "parallel_dim": 0, @@ -139,7 +138,6 @@ def __init__(self, if bias: self.bias = Parameter( torch.empty(self.num_embeddings_per_partition, - device=torch.cuda.current_device(), dtype=params_dtype)) set_weight_attrs(self.bias, { "parallel_dim": 0, diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index cd21c7788fc7d..4b1e13d9e9e0a 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn -from vllm.config import ModelConfig, LoRAConfig +from vllm.config import DeviceConfig, ModelConfig, LoRAConfig from vllm.model_executor.models import ModelRegistry from vllm.model_executor.weight_utils import (get_quant_config, initialize_dummy_weights) @@ -38,6 +38,7 @@ def _get_model_architecture(model_config: ModelConfig) -> Type[nn.Module]: def get_model(model_config: ModelConfig, + device_config: DeviceConfig, lora_config: Optional[LoRAConfig] = None) -> nn.Module: model_class = _get_model_architecture(model_config) @@ -64,7 +65,7 @@ def get_model(model_config: ModelConfig, with 
_set_default_torch_dtype(model_config.dtype): # Create a model instance. # The weights will be initialized as empty tensors. - with torch.device("cuda"): + with torch.device(device_config.device): if getattr(model_class, "supports_lora", False): model = model_class(model_config.hf_config, linear_method, lora_config) diff --git a/vllm/utils.py b/vllm/utils.py index dc81741498356..9e9126a2d6377 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -228,7 +228,8 @@ def create_kv_caches_with_random( device: Optional[str] = "cuda", ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) if isinstance(cache_dtype, str): if cache_dtype == "auto": diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index f57e1ed75803d..bbe33989fc2a4 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -104,11 +104,13 @@ def allocate_cpu_cache(self) -> List[KVCache]: size=(self.num_cpu_blocks, *key_block_shape), dtype=self.dtype, pin_memory=pin_memory, + device="cpu", ) value_blocks = torch.empty( size=(self.num_cpu_blocks, *value_block_shape), dtype=self.dtype, pin_memory=pin_memory, + device="cpu", ) cpu_cache.append((key_blocks, value_blocks)) return cpu_cache diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 2df9fd5215a2d..fce0009e3097d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn -from vllm.config import ModelConfig, LoRAConfig, ParallelConfig, SchedulerConfig +from vllm.config import DeviceConfig, ModelConfig, LoRAConfig, ParallelConfig, SchedulerConfig from vllm.logger import init_logger from vllm.model_executor import get_model, InputMetadata, SamplingMetadata from vllm.model_executor.parallel_utils.communication_op import ( @@ -35,6 +35,7 @@ def __init__( model_config: ModelConfig, parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, + device_config: DeviceConfig, lora_config: Optional[LoRAConfig], kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, @@ -49,7 +50,10 @@ def __init__( # FIXME(woosuk): This is a hack to make the tests work. Refactor this. self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None) - self.device = torch.device(torch.cuda.current_device()) + self.device_config = (device_config + if device_config is not None else DeviceConfig()) + self.device = self.device_config.device + self.model = None self.block_size = None # Set after initial profiling. 
self.lora_manager = None @@ -72,7 +76,8 @@ def __init__( self.kv_cache_dtype = kv_cache_dtype def load_model(self) -> None: - self.model = get_model(self.model_config, self.lora_config) + self.model = get_model(self.model_config, self.device_config, + self.lora_config) vocab_size = self.model.config.vocab_size @@ -182,22 +187,25 @@ def _prepare_prompt( input_tokens = _make_tensor_with_pad(input_tokens, max_prompt_len, pad=0, - dtype=torch.long) + dtype=torch.long, + device=self.device) input_positions = _make_tensor_with_pad(input_positions, max_prompt_len, pad=0, - dtype=torch.long) + dtype=torch.long, + device=self.device) slot_mapping = _make_tensor_with_pad(slot_mapping, max_prompt_len, pad=_PAD_SLOT_ID, - dtype=torch.long) + dtype=torch.long, + device=self.device) lora_index_mapping = [ _pad_to_max(mapping, max_prompt_len, pad=0) for mapping in lora_index_mapping ] context_lens_tensor = torch.tensor(context_lens, dtype=torch.int, - device='cuda') + device=self.device) # Prepare prefix block tables max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) block_tables = _make_tensor_with_pad( @@ -205,15 +213,16 @@ def _prepare_prompt( max_len=max_prompt_block_table_len, pad=0, dtype=torch.int, + device=self.device, ) start_loc_tensor = torch.arange(0, len(prompt_lens) * max_prompt_len, max_prompt_len, dtype=torch.long, - device='cuda') + device=self.device) prompt_lens_tensor = torch.tensor(prompt_lens, dtype=torch.long, - device='cuda') + device=self.device) input_metadata = InputMetadata( is_prompt=True, @@ -305,20 +314,20 @@ def _prepare_decode( max_len=1, pad=0, dtype=torch.long, - device="cuda") + device=self.device) input_positions = _make_tensor_with_pad(input_positions, max_len=1, pad=0, dtype=torch.long, - device="cuda") + device=self.device) slot_mapping = _make_tensor_with_pad(slot_mapping, max_len=1, pad=_PAD_SLOT_ID, dtype=torch.long, - device="cuda") + device=self.device) context_lens = torch.tensor(context_lens, dtype=torch.int, - device="cuda") + device=self.device) if use_captured_graph: # The shape of graph_block_tables is @@ -327,7 +336,7 @@ def _prepare_decode( for i, block_table in enumerate(block_tables): if block_table: input_block_tables[i, :len(block_table)] = block_table - block_tables = torch.tensor(input_block_tables, device="cuda") + block_tables = torch.tensor(input_block_tables, device=self.device) else: max_block_table_len = max( len(block_table) for block_table in block_tables) @@ -336,7 +345,7 @@ def _prepare_decode( max_len=max_block_table_len, pad=0, dtype=torch.int, - device="cuda", + device=self.device, ) lora_index_mapping = [ @@ -355,7 +364,8 @@ def _prepare_decode( use_cuda_graph=use_captured_graph, kv_cache_dtype=self.kv_cache_dtype, ) - return input_tokens, input_positions, input_metadata, lora_index_mapping, lora_prompt_mapping, lora_requests + return (input_tokens, input_positions, input_metadata, + lora_index_mapping, lora_prompt_mapping, lora_requests) def _prepare_sample( self, @@ -410,9 +420,13 @@ def _prepare_sample( selected_token_indices = _async_h2d(selected_token_indices, dtype=torch.long, + target_device=self.device, pin_memory=not self.in_wsl) categorized_sample_indices = { - t: _async_h2d(seq_ids, dtype=torch.int, pin_memory=not self.in_wsl) + t: _async_h2d(seq_ids, + dtype=torch.int, + target_device=self.device, + pin_memory=not self.in_wsl) for t, seq_ids in categorized_sample_indices.items() } @@ -511,7 +525,8 @@ def prepare_input_tensors( perform_sampling=False, ) - return input_tokens, input_positions, 
input_metadata, sampling_metadata, lora_requests, lora_mapping + return (input_tokens, input_positions, input_metadata, + sampling_metadata, lora_requests, lora_mapping) @torch.inference_mode() def execute_model( @@ -519,8 +534,9 @@ def execute_model( seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], ) -> Optional[SamplerOutput]: - input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests, lora_mapping = ( - self.prepare_input_tensors(seq_group_metadata_list)) + (input_tokens, input_positions, input_metadata, sampling_metadata, + lora_requests, + lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list) if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) @@ -789,14 +805,10 @@ def _make_tensor_with_pad( max_len: int, pad: int, dtype: torch.dtype, - device: Union[str, torch.device] = "cuda", - pin_memory: bool = False, + device: Optional[Union[str, torch.device]], ) -> torch.Tensor: padded_x = [_pad_to_max(x_i, max_len, pad) for x_i in x] - return torch.tensor(padded_x, - dtype=dtype, - device=device, - pin_memory=pin_memory and str(device) == "cpu") + return torch.tensor(padded_x, dtype=dtype, device=device) def _get_graph_batch_size(batch_size: int) -> int: @@ -808,6 +820,11 @@ def _get_graph_batch_size(batch_size: int) -> int: return (batch_size + 7) // 8 * 8 -def _async_h2d(data: list, dtype, pin_memory): - t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory) - return t.to(device="cuda", non_blocking=True) +def _async_h2d( + data: list, + dtype: torch.dtype, + target_device: Union[str, torch.device], + pin_memory: bool, +) -> torch.Tensor: + t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu") + return t.to(device=target_device, non_blocking=True) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index a74adfa585611..c97e82a55a1ee 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -6,8 +6,8 @@ import torch import torch.distributed -from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, - SchedulerConfig, LoRAConfig) +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) from vllm.model_executor import set_random_seed from vllm.model_executor.parallel_utils.communication_op import ( broadcast_tensor_dict) @@ -33,6 +33,7 @@ def __init__( model_config: ModelConfig, parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, + device_config: DeviceConfig, local_rank: int, rank: int, distributed_init_method: str, @@ -43,6 +44,7 @@ def __init__( self.model_config = model_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config + self.device_config = device_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method @@ -54,6 +56,7 @@ def __init__( self.model_runner = ModelRunner(model_config, parallel_config, scheduler_config, + device_config, lora_config=self.lora_config, kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker) @@ -65,21 +68,24 @@ def __init__( self.gpu_cache = None def init_model(self) -> None: - # torch.distributed.all_reduce does not free the input tensor until - # the synchronization point. This causes the memory usage to grow - # as the number of all_reduce calls increases. This env var disables - # this behavior. 
- # Related issue: - # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573 - os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" - - # This env var set by Ray causes exceptions with graph building. - os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) - self.device = torch.device(f"cuda:{self.local_rank}") - torch.cuda.set_device(self.device) - - _check_if_gpu_supports_dtype(self.model_config.dtype) - + if self.device_config.device.type == "cuda": + # torch.distributed.all_reduce does not free the input tensor until + # the synchronization point. This causes the memory usage to grow + # as the number of all_reduce calls increases. This env var disables + # this behavior. + # Related issue: + # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573 + os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" + + # This env var set by Ray causes exceptions with graph building. + os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) + self.device = torch.device(f"cuda:{self.local_rank}") + torch.cuda.set_device(self.device) + + _check_if_gpu_supports_dtype(self.model_config.dtype) + else: + raise RuntimeError( + f"Unsupported device type: {self.device_config.device}") # Initialize the distributed environment. init_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method) From 0e163fce18594c7e29dc5a143dd6b33d213fcbf3 Mon Sep 17 00:00:00 2001 From: zspo Date: Fri, 2 Feb 2024 07:59:39 +0800 Subject: [PATCH 034/112] Fix default length_penalty to 1.0 (#2667) --- vllm/sequence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/sequence.py b/vllm/sequence.py index 7b1c9a77a1e02..9669562cfeac5 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -196,7 +196,7 @@ def get_cumulative_logprob(self) -> float: return self.data.cumulative_logprob def get_beam_search_score(self, - length_penalty: float = 0.0, + length_penalty: float = 1.0, seq_len: Optional[int] = None, eos_token_id: Optional[int] = None) -> float: """Calculate the beam search score with length penalty. From 4abf6336ec65c270343eb895e7b18786e9274176 Mon Sep 17 00:00:00 2001 From: Cheng Su Date: Fri, 2 Feb 2024 15:41:42 -0800 Subject: [PATCH 035/112] Add one example to run batch inference distributed on Ray (#2696) --- examples/offline_inference_distributed.py | 70 +++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 examples/offline_inference_distributed.py diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference_distributed.py new file mode 100644 index 0000000000000..0897045fd94ae --- /dev/null +++ b/examples/offline_inference_distributed.py @@ -0,0 +1,70 @@ +""" +This example shows how to use Ray Data for running offline batch inference +distributed across a multi-node cluster. + +Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html +""" + +from vllm import LLM, SamplingParams +from typing import Dict +import numpy as np +import ray + +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + +# Create a class to do batch inference. +class LLMPredictor: + + def __init__(self): + # Create an LLM. + self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf") + + def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]: + # Generate texts from the prompts. + # The output is a list of RequestOutput objects that contain the prompt, + # generated text, and other information.
+ outputs = self.llm.generate(batch["text"], sampling_params) + prompt = [] + generated_text = [] + for output in outputs: + prompt.append(output.prompt) + generated_text.append(' '.join([o.text for o in output.outputs])) + return { + "prompt": prompt, + "generated_text": generated_text, + } + + +# Read one text file from S3. Ray Data supports reading multiple files +# from cloud storage (such as JSONL, Parquet, CSV, binary format). +ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt") + +# Apply batch inference for all input data. +ds = ds.map_batches( + LLMPredictor, + # Set the concurrency to the number of LLM instances. + concurrency=10, + # Specify the number of GPUs required per LLM instance. + # NOTE: Do NOT set `num_gpus` when using vLLM with tensor-parallelism + # (i.e., `tensor_parallel_size`). + num_gpus=1, + # Specify the batch size for inference. + batch_size=32, +) + +# Peek first 10 results. +# NOTE: This is for local testing and debugging. For production use case, +# one should write full result out as shown below. +outputs = ds.take(limit=10) +for output in outputs: + prompt = output["prompt"] + generated_text = output["generated_text"] + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +# Write inference output data out as Parquet files to S3. +# Multiple files would be written to the output destination, +# and each task would write one or more files separately. +# +# ds.write_parquet("s3://") From 5ed704ec8c4e68f1bc846ab4e3c9e355585d62da Mon Sep 17 00:00:00 2001 From: Massimiliano Pronesti Date: Sun, 4 Feb 2024 03:17:55 +0100 Subject: [PATCH 036/112] docs: fix langchain (#2736) --- docs/source/serving/serving_with_langchain.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/serving/serving_with_langchain.rst b/docs/source/serving/serving_with_langchain.rst index 2e1ce688290ad..6440c8aad5986 100644 --- a/docs/source/serving/serving_with_langchain.rst +++ b/docs/source/serving/serving_with_langchain.rst @@ -9,13 +9,13 @@ To install langchain, run .. code-block:: console - $ pip install langchain -q + $ pip install langchain langchain_community -q To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``. .. code-block:: python - from langchain.llms import VLLM + from langchain_community.llms import VLLM llm = VLLM(model="mosaicml/mpt-7b", trust_remote_code=True, # mandatory for hf models @@ -28,4 +28,4 @@ To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langcha print(llm("What is the capital of France ?")) -Please refer to this `Tutorial `_ for more details. \ No newline at end of file +Please refer to this `Tutorial `_ for more details. 
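For illustration only (not part of the patch): a short follow-up to the LangChain snippet above, sketching how the ``VLLM`` wrapper can be composed into a chain. It assumes LangChain's ``PromptTemplate``/``LLMChain`` API from the same release line as ``langchain_community``; the prompt text is just an example.

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import VLLM

# Wrap vLLM behind LangChain's community integration, then drive it
# through a simple prompt-template chain.
llm = VLLM(model="mosaicml/mpt-7b",
           trust_remote_code=True,  # mandatory for hf models
           max_new_tokens=64)

prompt = PromptTemplate(input_variables=["country"],
                        template="What is the capital of {country}?")
chain = LLMChain(llm=llm, prompt=prompt)
print(chain.run(country="France"))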
From 51cd22ce56b93e74cca22eaca286ff4770e8157c Mon Sep 17 00:00:00 2001 From: dancingpipi Date: Mon, 5 Feb 2024 06:25:36 +0800 Subject: [PATCH 037/112] set&get llm internal tokenizer instead of the TokenizerGroup (#2741) Co-authored-by: shujunhua1 --- vllm/entrypoints/llm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 614e6fa520c8c..fc82018d18eb6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -111,13 +111,13 @@ def __init__( def get_tokenizer( self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: - return self.llm_engine.tokenizer + return self.llm_engine.tokenizer.tokenizer def set_tokenizer( self, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], ) -> None: - self.llm_engine.tokenizer = tokenizer + self.llm_engine.tokenizer.tokenizer = tokenizer def generate( self, From 5a6c81b0511da333b1fabf5ad612eb7874d5e88e Mon Sep 17 00:00:00 2001 From: Rex Date: Sun, 4 Feb 2024 14:32:42 -0800 Subject: [PATCH 038/112] Remove eos tokens from output by default (#2611) --- vllm/engine/llm_engine.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 92568450a0d59..02c673c96fd9a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -908,13 +908,13 @@ def _check_stop(self, seq: Sequence, """Stop the finished sequences.""" for stop_str in sampling_params.stop: if seq.output_text.endswith(stop_str): - if not sampling_params.include_stop_str_in_output: - # Truncate the output text so that the stop string is - # not included in the output. - seq.output_text = seq.output_text[:-len(stop_str)] + self._finalize_sequence(seq, sampling_params, stop_str) seq.status = SequenceStatus.FINISHED_STOPPED return if seq.get_last_token_id() in sampling_params.stop_token_ids: + stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( + seq.get_last_token_id()) + self._finalize_sequence(seq, sampling_params, stop_str) seq.status = SequenceStatus.FINISHED_STOPPED return @@ -934,6 +934,14 @@ def _check_stop(self, seq: Sequence, seq.status = SequenceStatus.FINISHED_STOPPED return + def _finalize_sequence(self, seq: Sequence, + sampling_params: SamplingParams, + stop_string: str) -> None: + if not sampling_params.include_stop_str_in_output and stop_string: + # Truncate the output text so that the stop string is + # not included in the output. + seq.output_text = seq.output_text[:-len(stop_string)] + def add_lora(self, lora_request: LoRARequest) -> bool: assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." return self._run_workers( From c9b45adeeb0e5b2f597d1687e0b8f24167602395 Mon Sep 17 00:00:00 2001 From: whyiug Date: Mon, 5 Feb 2024 15:07:36 +0800 Subject: [PATCH 039/112] Require triton >= 2.1.0 (#2746) Co-authored-by: yangrui1 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 2bf527ccc3a77..5684b2c29634d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. 
aioprometheus[starlette] pynvml == 11.5.0 +triton >= 2.1.0 From 72d3a30c6327e70de3595d00f04e2d577fcbbb68 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 5 Feb 2024 12:45:37 -0800 Subject: [PATCH 040/112] [Minor] Fix benchmark_latency script (#2765) --- benchmarks/benchmark_latency.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 2eb9e2cb8e4d5..6e3b679cb81b2 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -37,7 +37,10 @@ def main(args: argparse.Namespace): max_tokens=args.output_len, ) print(sampling_params) - dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size + dummy_prompt_token_ids = np.random.randint(10000, + size=(args.batch_size, + args.input_len)) + dummy_prompt_token_ids = dummy_prompt_token_ids.tolist() def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: @@ -71,7 +74,7 @@ def run_to_completion(profile_dir: Optional[str] = None): "." ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" print(f"Profiling (results will be saved to '{profile_dir}')...") - run_to_completion(profile_dir=args.profile_result_dir) + run_to_completion(profile_dir=profile_dir) return # Benchmark. From 56f738ae9b631189e67795b397258afbed59b042 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Mon, 5 Feb 2024 17:25:36 -0500 Subject: [PATCH 041/112] [ROCm] Fix some kernels failed unit tests (#2498) --- tests/kernels/allclose_default.py | 18 ++++++++++++++++++ tests/kernels/test_activation.py | 16 +++++++++++++--- tests/kernels/test_attention.py | 22 +++++++++++++++++----- tests/kernels/test_cache.py | 6 +++++- tests/kernels/test_pos_encoding.py | 12 +++++++++--- 5 files changed, 62 insertions(+), 12 deletions(-) create mode 100644 tests/kernels/allclose_default.py diff --git a/tests/kernels/allclose_default.py b/tests/kernels/allclose_default.py new file mode 100644 index 0000000000000..175cfe82fb74e --- /dev/null +++ b/tests/kernels/allclose_default.py @@ -0,0 +1,18 @@ +import torch + +# Reference default values of atol and rtol are from +# https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 +default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} +default_rtol = { + torch.float16: 1e-3, + torch.bfloat16: 1.6e-2, + torch.float: 1.3e-6 +} + + +def get_default_atol(output) -> float: + return default_atol[output.dtype] + + +def get_default_rtol(output) -> float: + return default_rtol[output.dtype] diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index de0b497057269..8e216c293f070 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -2,6 +2,7 @@ import torch from vllm.model_executor.layers.activation import FastGELU, NewGELU, SiluAndMul +from allclose_default import get_default_atol, get_default_rtol DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing @@ -33,7 +34,10 @@ def test_silu_and_mul( layer = SiluAndMul() out = layer(x) ref_out = layer._forward(x) - assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) + assert torch.allclose(out, + ref_out, + atol=get_default_atol(out), + rtol=get_default_rtol(out)) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -57,7 +61,10 @@ def test_gelu_new( layer = NewGELU() out = layer(x) ref_out = layer._forward(x) - assert 
torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) + assert torch.allclose(out, + ref_out, + atol=get_default_atol(out), + rtol=get_default_rtol(out)) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -80,4 +87,7 @@ def test_gelu_fast( layer = FastGELU() out = layer(x) ref_out = layer._forward(x) - assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) + assert torch.allclose(out, + ref_out, + atol=get_default_atol(out), + rtol=get_default_rtol(out)) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 92d63eb6c63e2..fb571de63d4e1 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -8,6 +8,8 @@ from vllm._C import ops, cache_ops from vllm.utils import get_max_shared_memory_bytes +from vllm.utils import is_hip +from allclose_default import get_default_atol, get_default_rtol FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. @@ -17,12 +19,18 @@ # Reduce NUM_BLOCKS when it happens. NUM_BLOCKS = 4321 # Arbitrary values for testing PARTITION_SIZE = 512 - -DTYPES = [torch.half, torch.bfloat16, torch.float] +# flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16} +DTYPES = [torch.half, torch.bfloat16, torch.float + ] if not is_hip() else [torch.half, torch.bfloat16] NUM_GEN_SEQS = [7] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 128, 256] + +# FlashAttention forward only supports head dimension at most 128 +# https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 +HEAD_SIZES = [64, 80, 96, 112, 128, 256 + ] if not is_hip() else [64, 80, 96, 112, 128] + BLOCK_SIZES = [16, 32] USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8_e5m2"] @@ -251,9 +259,11 @@ def test_paged_attention( # NOTE(woosuk): Due to the kernel-level differences in the two # implementations, there is a small numerical difference in the two # outputs. Thus, we use a relaxed tolerance for the test. + atol = get_default_atol(output) if is_hip() else 1e-3 + rtol = get_default_rtol(output) if is_hip() else 1e-5 + # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, # so we use a relaxed tolerance for the test. 
- atol, rtol = 1e-3, 1e-5 if kv_cache_dtype == "fp8_e5m2": atol, rtol = 1e-2, 1e-5 assert torch.allclose(output, ref_output, atol=atol, rtol=rtol) @@ -357,4 +367,6 @@ def test_multi_query_kv_attention( scale, dtype, ) - assert torch.allclose(output, ref_output, atol=1e-3, rtol=1e-5) + atol = get_default_atol(output) if is_hip() else 1e-3 + rtol = get_default_rtol(output) if is_hip() else 1e-5 + assert torch.allclose(output, ref_output, atol=atol, rtol=rtol) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index a90492f53eee6..e0368d926d51a 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -6,6 +6,7 @@ from typing import Tuple from vllm._C import cache_ops +from vllm.utils import is_hip COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -14,7 +15,10 @@ NUM_HEADS = [8] # Arbitrary values for testing HEAD_SIZES = [64, 80, 96, 112, 128, 256] BLOCK_SIZES = [8, 16, 32] -NUM_BLOCKS = [1024, 3600] # Arbitrary values for testing +# reduce the size for ROCm test to avoid HIP OOM +NUM_BLOCKS = [1024, 36000] if not is_hip() else [ + 1024, 10000 +] # Arbitrary values for testing NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] CUDA_DEVICES = [ diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 19cbd600e838f..0d27bbaff9fc5 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -2,7 +2,7 @@ import pytest import torch - +from allclose_default import get_default_atol, get_default_rtol from vllm.model_executor.layers.rotary_embedding import get_rope IS_NEOX_STYLE = [True, False] @@ -64,5 +64,11 @@ def test_rotary_embedding( ref_query, ref_key = rope._forward(positions, query, key) out_query, out_key = rope.forward(positions, query, key) # Compare the results.
- assert torch.allclose(out_query, ref_query, atol=1e-5, rtol=1e-5) - assert torch.allclose(out_key, ref_key, atol=1e-5, rtol=1e-5) + assert torch.allclose(out_query, + ref_query, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query)) + assert torch.allclose(out_key, + ref_key, + atol=get_default_atol(out_key), + rtol=get_default_rtol(out_key)) From b92adec8e88f5b69384189faae17b48b5980cba3 Mon Sep 17 00:00:00 2001 From: Lukas <52111220+gardberg@users.noreply.github.com> Date: Mon, 5 Feb 2024 23:26:50 +0100 Subject: [PATCH 042/112] Set local logging level via env variable (#2774) --- vllm/logger.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/logger.py b/vllm/logger.py index 24d4f0ec1ae0a..530494ae66925 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -3,6 +3,7 @@ """Logging configuration for vLLM.""" import logging import sys +import os _FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" _DATE_FORMAT = "%m-%d %H:%M:%S" @@ -50,7 +51,7 @@ def _setup_logger(): def init_logger(name: str): # Use the same settings as above for root logger logger = logging.getLogger(name) - logger.setLevel(logging.DEBUG) + logger.setLevel(os.getenv("LOG_LEVEL", "DEBUG")) logger.addHandler(_default_handler) logger.propagate = False return logger From 2ccee3def6d8532afaf6fcd351b228ec1dfd6013 Mon Sep 17 00:00:00 2001 From: Douglas Lehr <91553416+dllehr-amd@users.noreply.github.com> Date: Mon, 5 Feb 2024 16:59:09 -0600 Subject: [PATCH 043/112] [ROCm] Fixup arch checks for ROCM (#2627) --- Dockerfile.rocm | 3 -- setup.py | 90 ++++++++++++++++++++++++++++++------------------- 2 files changed, 56 insertions(+), 37 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 88172fb73b937..3c76305303037 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -10,9 +10,6 @@ RUN echo "Base image is $BASE_IMAGE" # BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" # BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" -# this does not always work for all rocm versions -RUN LLVM_GFX_ARCH=$(/opt/rocm/llvm/bin/amdgpu-offload-arch) && \ - echo "LLVM_GFX_ARCH is $LLVM_GFX_ARCH" ARG FA_GFX_ARCHS="gfx90a;gfx942" RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" diff --git a/setup.py b/setup.py index 3e2127855a755..0c4937da210ef 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ # Supported NVIDIA GPU architectures. NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"} +ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx942"} # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) @@ -63,22 +63,6 @@ def _is_cuda() -> bool: NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] -def get_amdgpu_offload_arch(): - command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" - try: - output = subprocess.check_output([command]) - return output.decode('utf-8').strip() - except subprocess.CalledProcessError as e: - error_message = f"Error: {e}" - raise RuntimeError(error_message) from e - except FileNotFoundError as e: - # If the command is not found, print an error message - error_message = f"The command {command} was not found." 
- raise RuntimeError(error_message) from e - - return None - - def get_hipcc_rocm_version(): # Run the hipcc --version command result = subprocess.run(['hipcc', '--version'], @@ -138,6 +122,50 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version: return nvcc_cuda_version +def get_pytorch_rocm_arch() -> Set[str]: + """Get the cross section of Pytorch,and vllm supported gfx arches + + ROCM can get the supported gfx architectures in one of two ways + Either through the PYTORCH_ROCM_ARCH env var, or output from + rocm_agent_enumerator. + + In either case we can generate a list of supported arch's and + cross reference with VLLM's own ROCM_SUPPORTED_ARCHs. + """ + env_arch_list = os.environ.get("PYTORCH_ROCM_ARCH", None) + + # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator + if env_arch_list is None: + command = "rocm_agent_enumerator" + env_arch_list = subprocess.check_output([command]).decode('utf-8')\ + .strip().replace("\n", ";") + arch_source_str = "rocm_agent_enumerator" + else: + arch_source_str = "PYTORCH_ROCM_ARCH env variable" + + # List are separated by ; or space. + pytorch_rocm_arch = set(env_arch_list.replace(" ", ";").split(";")) + + # Filter out the invalid architectures and print a warning. + arch_list = pytorch_rocm_arch.intersection(ROCM_SUPPORTED_ARCHS) + + # If none of the specified architectures are valid, raise an error. + if not arch_list: + raise RuntimeError( + f"None of the ROCM architectures in {arch_source_str} " + f"({env_arch_list}) is supported. " + f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}.") + invalid_arch_list = pytorch_rocm_arch - ROCM_SUPPORTED_ARCHS + if invalid_arch_list: + warnings.warn( + f"Unsupported ROCM architectures ({invalid_arch_list}) are " + f"excluded from the {arch_source_str} output " + f"({env_arch_list}). Supported ROCM architectures are: " + f"{ROCM_SUPPORTED_ARCHS}.", + stacklevel=2) + return arch_list + + def get_torch_arch_list() -> Set[str]: # TORCH_CUDA_ARCH_LIST can have one or more architectures, # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the @@ -162,22 +190,27 @@ def get_torch_arch_list() -> Set[str]: # If none of the specified architectures are valid, raise an error. if not arch_list: raise RuntimeError( - "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env " + "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env " f"variable ({env_arch_list}) is supported. " - f"Supported CUDA/ROCM architectures are: {valid_archs}.") + f"Supported CUDA architectures are: {valid_archs}.") invalid_arch_list = torch_arch_list - valid_archs if invalid_arch_list: warnings.warn( - f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are " + f"Unsupported CUDA architectures ({invalid_arch_list}) are " "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " - f"({env_arch_list}). Supported CUDA/ROCM architectures are: " + f"({env_arch_list}). Supported CUDA architectures are: " f"{valid_archs}.", stacklevel=2) return arch_list -# First, check the TORCH_CUDA_ARCH_LIST environment variable. -compute_capabilities = get_torch_arch_list() +if _is_hip(): + rocm_arches = get_pytorch_rocm_arch() + NVCC_FLAGS += ["--offload-arch=" + arch for arch in rocm_arches] +else: + # First, check the TORCH_CUDA_ARCH_LIST environment variable. + compute_capabilities = get_torch_arch_list() + if _is_cuda() and not compute_capabilities: # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available # GPUs on the current machine. 
@@ -286,17 +319,6 @@ def get_torch_arch_list() -> Set[str]: "nvcc": NVCC_FLAGS_PUNICA, }, )) -elif _is_hip(): - amd_archs = os.getenv("GPU_ARCHS") - if amd_archs is None: - amd_archs = get_amdgpu_offload_arch() - for arch in amd_archs.split(";"): - if arch not in ROCM_SUPPORTED_ARCHS: - raise RuntimeError( - f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" - f"amdgpu_arch_found: {arch}") - NVCC_FLAGS += [f"--offload-arch={arch}"] - elif _is_neuron(): neuronxcc_version = get_neuronxcc_version() From f0d4e145575bf6fb96c141d776ce92c9bfc79c49 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 5 Feb 2024 17:38:02 -0800 Subject: [PATCH 044/112] Add fused top-K softmax kernel for MoE (#2769) --- csrc/moe/moe_ops.cpp | 7 + csrc/moe/moe_ops.h | 9 + csrc/moe/topk_softmax_kernels.cu | 499 ++++++++++++++++++++++++ csrc/pybind.cpp | 2 +- setup.py | 11 + tests/kernels/test_moe.py | 26 +- vllm/model_executor/layers/fused_moe.py | 58 ++- vllm/model_executor/models/deepseek.py | 15 +- vllm/model_executor/models/mixtral.py | 14 +- 9 files changed, 591 insertions(+), 50 deletions(-) create mode 100644 csrc/moe/moe_ops.cpp create mode 100644 csrc/moe/moe_ops.h create mode 100644 csrc/moe/topk_softmax_kernels.cu diff --git a/csrc/moe/moe_ops.cpp b/csrc/moe/moe_ops.cpp new file mode 100644 index 0000000000000..35c328499a22d --- /dev/null +++ b/csrc/moe/moe_ops.cpp @@ -0,0 +1,7 @@ +#include "moe_ops.h" + +#include + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("topk_softmax", &topk_softmax, "Apply topk softmax to the gating outputs."); +} diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h new file mode 100644 index 0000000000000..a01be3e426d72 --- /dev/null +++ b/csrc/moe/moe_ops.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +void topk_softmax( + torch::Tensor& topk_weights, + torch::Tensor& topk_indices, + torch::Tensor& token_expert_indices, + torch::Tensor& gating_output); diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu new file mode 100644 index 0000000000000..8c65f40fe836a --- /dev/null +++ b/csrc/moe/topk_softmax_kernels.cu @@ -0,0 +1,499 @@ +/* + * Adapted from https://github.com/NVIDIA/TensorRT-LLM/blob/v0.7.1/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu + * Copyright (c) 2024, The vLLM team. + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include + +#include +#include + +namespace vllm { +namespace moe { + +static constexpr int WARP_SIZE = 32; + +/// Aligned array type +template < + typename T, + /// Number of elements in the array + int N, + /// Alignment requirement in bytes + int Alignment = sizeof(T) * N +> +class alignas(Alignment) AlignedArray { + float data[N]; +}; + +// ====================== Softmax things =============================== +// We have our own implementation of softmax here so we can support transposing the output +// in the softmax kernel when we extend this module to support expert-choice routing. +template +__launch_bounds__(TPB) __global__ + void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage tmpStorage; + + __shared__ float normalizing_factor; + __shared__ float float_max; + + const int thread_row_offset = blockIdx.x * num_cols; + + cub::Sum sum; + float threadData(-FLT_MAX); + + // Don't touch finished rows. + if ((finished != nullptr) && finished[blockIdx.x]) + { + return; + } + + for (int ii = threadIdx.x; ii < num_cols; ii += TPB) + { + const int idx = thread_row_offset + ii; + threadData = max(static_cast(input[idx]), threadData); + } + + const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max()); + if (threadIdx.x == 0) + { + float_max = maxElem; + } + __syncthreads(); + + threadData = 0; + + for (int ii = threadIdx.x; ii < num_cols; ii += TPB) + { + const int idx = thread_row_offset + ii; + threadData += exp((static_cast(input[idx]) - float_max)); + } + + const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum); + + if (threadIdx.x == 0) + { + normalizing_factor = 1.f / Z; + } + __syncthreads(); + + for (int ii = threadIdx.x; ii < num_cols; ii += TPB) + { + const int idx = thread_row_offset + ii; + const float val = exp((static_cast(input[idx]) - float_max)) * normalizing_factor; + output[idx] = val; + } +} + +template +__launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax, const bool* finished, float* output, + int* indices, int* source_rows, const int num_experts, const int k, const int start_expert, const int end_expert) +{ + + using cub_kvp = cub::KeyValuePair; + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage tmpStorage; + + cub_kvp thread_kvp; + cub::ArgMax arg_max; + + const int num_rows = gridDim.x; + const int block_row = blockIdx.x; + + const bool row_is_active = finished ? 
!finished[block_row] : true; + const int thread_read_offset = blockIdx.x * num_experts; + for (int k_idx = 0; k_idx < k; ++k_idx) + { + thread_kvp.key = 0; + thread_kvp.value = -1.f; // This is OK because inputs are probabilities + + cub_kvp inp_kvp; + for (int expert = threadIdx.x; expert < num_experts; expert += TPB) + { + const int idx = thread_read_offset + expert; + inp_kvp.key = expert; + inp_kvp.value = inputs_after_softmax[idx]; + + for (int prior_k = 0; prior_k < k_idx; ++prior_k) + { + const int prior_winning_expert = indices[k * block_row + prior_k]; + + if (prior_winning_expert == expert) + { + inp_kvp = thread_kvp; + } + } + + thread_kvp = arg_max(inp_kvp, thread_kvp); + } + + const cub_kvp result_kvp = BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max); + if (threadIdx.x == 0) + { + // Ignore experts the node isn't responsible for with expert parallelism + const int expert = result_kvp.key; + const bool node_uses_expert = expert >= start_expert && expert < end_expert; + const bool should_process_row = row_is_active && node_uses_expert; + + const int idx = k * block_row + k_idx; + output[idx] = result_kvp.value; + indices[idx] = should_process_row ? (expert - start_expert) : num_experts; + assert(indices[idx] >= 0); + source_rows[idx] = k_idx * num_rows + block_row; + } + __syncthreads(); + } +} + +// ====================== TopK softmax things =============================== + +/* + A Top-K gating softmax written to exploit when the number of experts in the MoE layers + are a small power of 2. This allows us to cleanly share the rows among the threads in + a single warp and eliminate communication between warps (so no need to use shared mem). + + It fuses the softmax, max and argmax into a single kernel. + + Limitations: + 1) This implementation is intended for when the number of experts is a small power of 2. + 2) This implementation assumes k is small, but will work for any k. +*/ + +template +__launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ + void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, int* indices, + int* source_rows, const int k, const int start_expert, const int end_expert) +{ + // We begin by enforcing compile time assertions and setting up compile time constants. + static_assert(VPT == (VPT & -VPT), "VPT must be power of 2"); + static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2"); + static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2"); + static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16"); + + // Number of bytes each thread pulls in per load + static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float); + static constexpr int ELTS_PER_ROW = NUM_EXPERTS; + static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT; + static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG; + + // Restrictions based on previous section. + static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg"); + static_assert(WARP_SIZE % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp"); + static_assert(THREADS_PER_ROW == (THREADS_PER_ROW & -THREADS_PER_ROW), "THREADS_PER_ROW must be power of 2"); + static_assert(THREADS_PER_ROW <= WARP_SIZE, "THREADS_PER_ROW can be at most warp size"); + + // We have NUM_EXPERTS elements per row. 
We specialize for small #experts + static constexpr int ELTS_PER_WARP = WARP_SIZE * VPT; + static constexpr int ROWS_PER_WARP = ELTS_PER_WARP / ELTS_PER_ROW; + static constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP; + + // Restrictions for previous section. + static_assert(ELTS_PER_WARP % ELTS_PER_ROW == 0, "The elts per row must cleanly divide the total elt per warp"); + + // ===================== From this point, we finally start computing run-time variables. ======================== + + // Compute CTA and warp rows. We pack multiple rows into a single warp, and a block contains WARPS_PER_CTA warps. + // This, each block processes a chunk of rows. We start by computing the start row for each block. + const int cta_base_row = blockIdx.x * ROWS_PER_CTA; + + // Now, using the base row per thread block, we compute the base row per warp. + const int warp_base_row = cta_base_row + threadIdx.y * ROWS_PER_WARP; + + // The threads in a warp are split into sub-groups that will work on a row. + // We compute row offset for each thread sub-group + const int thread_row_in_warp = threadIdx.x / THREADS_PER_ROW; + const int thread_row = warp_base_row + thread_row_in_warp; + + // Threads with indices out of bounds should early exit here. + if (thread_row >= num_rows) + { + return; + } + const bool row_is_active = finished ? !finished[thread_row] : true; + + // We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the + // row it will read. + const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW; + + // Now, we compute the group each thread belong to in order to determine the first column to start loads. + const int thread_group_idx = threadIdx.x % THREADS_PER_ROW; + const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG; + const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread; + + // Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory, + // this can support all powers of 2 up to 16. + // NOTE(woosuk): The original implementation uses CUTLASS aligned array here. + // We defined our own aligned array and use it here to avoid the dependency on CUTLASS. + using AccessType = AlignedArray; + + // Finally, we pull in the data from global mem + float row_chunk[VPT]; + AccessType* row_chunk_vec_ptr = reinterpret_cast(&row_chunk); + const AccessType* vec_thread_read_ptr = reinterpret_cast(thread_read_ptr); +#pragma unroll + for (int ii = 0; ii < LDG_PER_THREAD; ++ii) + { + row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW]; + } + + // First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just + // convert to float afterwards for the exp + sum reduction. + float thread_max = row_chunk[0]; +#pragma unroll + for (int ii = 1; ii < VPT; ++ii) + { + thread_max = max(thread_max, row_chunk[ii]); + } + +// Now, we find the max within the thread group and distribute among the threads. We use a butterfly reduce. +#pragma unroll + for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) + { + thread_max = max(thread_max, __shfl_xor_sync(0xFFFFFFFF, thread_max, mask, THREADS_PER_ROW)); + } + + // From this point, thread max in all the threads have the max within the row. + // Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum. 
+ float row_sum = 0; +#pragma unroll + for (int ii = 0; ii < VPT; ++ii) + { + row_chunk[ii] = expf(row_chunk[ii] - thread_max); + row_sum += row_chunk[ii]; + } + +// Now, we perform the sum reduce within each thread group. Similar to the max reduce, we use a bufferfly pattern. +#pragma unroll + for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) + { + row_sum += __shfl_xor_sync(0xFFFFFFFF, row_sum, mask, THREADS_PER_ROW); + } + + // From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables + // respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to + // compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row. + // However, this kernel will likely not be a bottle neck and it seems better to closer match torch and find the + // argmax after computing the softmax. + const float reciprocal_row_sum = 1.f / row_sum; + +#pragma unroll + for (int ii = 0; ii < VPT; ++ii) + { + row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum; + } + + // Now, softmax_res contains the softmax of the row chunk. Now, I want to find the topk elements in each row, along + // with the max index. + int start_col = first_elt_read_by_thread; + static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW; + + for (int k_idx = 0; k_idx < k; ++k_idx) + { + // First, each thread does the local argmax + float max_val = row_chunk[0]; + int expert = start_col; +#pragma unroll + for (int ldg = 0, col = start_col; ldg < LDG_PER_THREAD; ++ldg, col += COLS_PER_GROUP_LDG) + { +#pragma unroll + for (int ii = 0; ii < ELTS_PER_LDG; ++ii) + { + float val = row_chunk[ldg * ELTS_PER_LDG + ii]; + + // No check on the experts here since columns with the smallest index are processed first and only + // updated if > (not >=) + if (val > max_val) + { + max_val = val; + expert = col + ii; + } + } + } + +// Now, we perform the argmax reduce. We use the butterfly pattern so threads reach consensus about the max. +// This will be useful for K > 1 so that the threads can agree on "who" had the max value. That thread can +// then blank out their max with -inf and the warp can run more iterations... +#pragma unroll + for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) + { + float other_max = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, THREADS_PER_ROW); + int other_expert = __shfl_xor_sync(0xFFFFFFFF, expert, mask, THREADS_PER_ROW); + + // We want lower indices to "win" in every thread so we break ties this way + if (other_max > max_val || (other_max == max_val && other_expert < expert)) + { + max_val = other_max; + expert = other_expert; + } + } + + // Write the max for this k iteration to global memory. + if (thread_group_idx == 0) + { + // Add a guard to ignore experts not included by this node + const bool node_uses_expert = expert >= start_expert && expert < end_expert; + const bool should_process_row = row_is_active && node_uses_expert; + + // The lead thread from each sub-group will write out the final results to global memory. (This will be a + // single) thread per row of the input/output matrices. + const int idx = k * thread_row + k_idx; + output[idx] = max_val; + indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS; + source_rows[idx] = k_idx * num_rows + thread_row; + } + + // Finally, we clear the value in the thread with the current max if there is another iteration to run. 
+ if (k_idx + 1 < k) + { + const int ldg_group_for_expert = expert / COLS_PER_GROUP_LDG; + const int thread_to_clear_in_group = (expert / ELTS_PER_LDG) % THREADS_PER_ROW; + + // Only the thread in the group which produced the max will reset the "winning" value to -inf. + if (thread_group_idx == thread_to_clear_in_group) + { + const int offset_for_expert = expert % ELTS_PER_LDG; + // Safe to set to any negative value since row_chunk values must be between 0 and 1. + row_chunk[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f; + } + } + } +} + +namespace detail +{ +// Constructs some constants needed to partition the work across threads at compile time. +template +struct TopkConstants +{ + static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float); + static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, ""); + static constexpr int VECs_PER_THREAD = std::max(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE)); + static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG; + static constexpr int THREADS_PER_ROW = EXPERTS / VPT; + static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW; +}; +} // namespace detail + +template +void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, int* indices, + int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream) +{ + static constexpr std::size_t MAX_BYTES_PER_LDG = 16; + + static constexpr int BYTES_PER_LDG = std::min(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS); + using Constants = detail::TopkConstants; + static constexpr int VPT = Constants::VPT; + static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP; + const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP; + const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB; + + dim3 block_dim(WARP_SIZE, WARPS_PER_TB); + topkGatingSoftmax<<>>( + input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert); +} + +#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \ + topkGatingSoftmaxLauncherHelper( \ + gating_output, nullptr, topk_weights, topk_indicies, \ + token_expert_indices, num_tokens, topk, 0, num_experts, \ + stream); + +void topkGatingSoftmaxKernelLauncher( + const float* gating_output, + float* topk_weights, + int* topk_indicies, + int* token_expert_indices, + float* softmax_workspace, + const int num_tokens, + const int num_experts, + const int topk, + cudaStream_t stream) { + static constexpr int WARPS_PER_TB = 4; + switch (num_experts) { + case 1: + LAUNCH_SOFTMAX(1, WARPS_PER_TB); + break; + case 2: + LAUNCH_SOFTMAX(2, WARPS_PER_TB); + break; + case 4: + LAUNCH_SOFTMAX(4, WARPS_PER_TB); + break; + case 8: + LAUNCH_SOFTMAX(8, WARPS_PER_TB); + break; + case 16: + LAUNCH_SOFTMAX(16, WARPS_PER_TB); + break; + case 32: + LAUNCH_SOFTMAX(32, WARPS_PER_TB); + break; + case 64: + LAUNCH_SOFTMAX(64, WARPS_PER_TB); + break; + case 128: + LAUNCH_SOFTMAX(128, WARPS_PER_TB); + break; + case 256: + LAUNCH_SOFTMAX(256, WARPS_PER_TB); + break; + default: { + TORCH_CHECK(softmax_workspace != nullptr, + "softmax_workspace must be provided for num_experts that are not a power of 2."); + static constexpr int TPB = 256; + moeSoftmax<<>>( + gating_output, nullptr, softmax_workspace, num_experts); + moeTopK<<>>( + softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices, + num_experts, topk, 0, num_experts); + } + } +} + +} // namespace moe +} // namespace vllm + +void 
topk_softmax( + torch::Tensor& topk_weights, // [num_tokens, topk] + torch::Tensor& topk_indices, // [num_tokens, topk] + torch::Tensor& token_expert_indices, // [num_tokens, topk] + torch::Tensor& gating_output) // [num_tokens, num_experts] +{ + const int num_experts = gating_output.size(-1); + const int num_tokens = gating_output.numel() / num_experts; + const int topk = topk_weights.size(-1); + + const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0); + const bool needs_workspace = !is_pow_2 || num_experts > 256; + const int64_t workspace_size = needs_workspace ? num_tokens * num_experts : 0; + + const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options()); + vllm::moe::topkGatingSoftmaxKernelLauncher( + gating_output.data_ptr(), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, + num_experts, + topk, + stream); +} diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 8a8235691ab8e..b36d259697167 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -48,8 +48,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { &rotary_embedding, "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); -#ifndef USE_ROCM // Quantization ops +#ifndef USE_ROCM ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ"); #endif diff --git a/setup.py b/setup.py index 0c4937da210ef..9cc4aea0ea75a 100644 --- a/setup.py +++ b/setup.py @@ -339,6 +339,17 @@ def get_torch_arch_list() -> Set[str]: vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") vllm_extension_sources.append("csrc/custom_all_reduce.cu") + # Add MoE kernels. + ext_modules.append( + CUDAExtension( + name="vllm._moe_C", + sources=glob("csrc/moe/*.cu") + glob("csrc/moe/*.cpp"), + extra_compile_args={ + "cxx": CXX_FLAGS, + "nvcc": NVCC_FLAGS, + }, + )) + if not _is_neuron(): vllm_extension = CUDAExtension( name="vllm._C", diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 227ddfc3661b3..c402fe3e98c7f 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -2,10 +2,8 @@ Run `pytest tests/kernels/test_moe.py`. 
""" - import pytest import torch - from transformers import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock @@ -14,22 +12,21 @@ from vllm.model_executor.models.mixtral import MixtralMoE -def torch_moe(a, w1, w2, topk_weight, topk_ids): +def torch_moe(a, w1, w2, score, topk): B, D = a.shape - a = a.view(B, -1, D).repeat(1, topk_ids.shape[1], 1).reshape(-1, D) - out = torch.zeros(B * topk_ids.shape[1], - w2.shape[1], - dtype=a.dtype, - device=a.device) - topk_ids = topk_ids.view(-1) + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + score = torch.softmax(score, dim=-1, dtype=torch.float32) + topk_weight, topk_ids = torch.topk(score, topk) topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) for i in range(w1.shape[0]): mask = topk_ids == i if mask.sum(): out[mask] = SiluAndMul()( a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1) return (out.view(B, -1, w2.shape[1]) * - topk_weight.view(B, -1, 1)).sum(dim=1) + topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) @pytest.mark.parametrize("m", [512, 222, 33, 1]) @@ -51,11 +48,8 @@ def test_fused_moe( w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10 score = torch.randn((m, e), device='cuda', dtype=dtype) - score = torch.softmax(score, dim=-1) - topk_weight, topk_ids = torch.topk(score, topk) - - triton_output = fused_moe(a, w1, w2, topk_weight, topk_ids, False) - torch_output = torch_moe(a, w1, w2, topk_weight, topk_ids) + triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False) + torch_output = torch_moe(a, w1, w2, score, topk) assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0) @@ -75,7 +69,7 @@ def test_mixtral_moe(dtype: torch.dtype): intermediate_size=config.intermediate_size, params_dtype=dtype, tp_size=1, - ) + ).cuda() # Load the weights vllm_moe.gate.linear_weights["weight"][:] = hf_moe.gate.weight.data diff --git a/vllm/model_executor/layers/fused_moe.py b/vllm/model_executor/layers/fused_moe.py index eed2e83bed7f8..bc3aef1887ef8 100644 --- a/vllm/model_executor/layers/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe.py @@ -4,6 +4,7 @@ import triton.language as tl from vllm._C import ops +from vllm.utils import is_hip @triton.jit @@ -177,7 +178,6 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, expert_ids: torch.Tensor, num_tokens_post_padded: torch.Tensor, mul_routed_weight: bool, top_k: int, config: dict): - assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 @@ -210,12 +210,15 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, ) -def fused_moe(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - inplace=False): +def fused_moe( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + inplace: bool = False, +) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2, and top-k gating mechanism. @@ -223,15 +226,19 @@ def fused_moe(hidden_states: torch.Tensor, - hidden_states (torch.Tensor): The input tensor to the MoE layer. - w1 (torch.Tensor): The first set of expert weights. - w2 (torch.Tensor): The second set of expert weights. - - topk_weights (torch.Tensor): The weights for the top-k selected experts. 
- - topk_ids (torch.Tensor): The indices of the top-k selected experts. + - gating_output (torch.Tensor): The output of the gating operation (before softmax). + - topk (int): The number of top-k experts to select. + - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - inplace (bool): If True, perform the operation in-place. Defaults to False. Returns: - torch.Tensor: The output tensor after applying the MoE layer. """ # Check constraints. - assert hidden_states.shape[1] == w1.shape[2], "Incompatible dimensions" + assert hidden_states.shape[0] == gating_output.shape[0], ( + "Number of tokens mismatch") + assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" + assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" assert w1.is_contiguous(), "Expert weights1 must be contiguous" assert w2.is_contiguous(), "Expert weights2 must be contiguous" @@ -241,6 +248,37 @@ def fused_moe(hidden_states: torch.Tensor, M, _ = hidden_states.shape E, N, _ = w1.shape + if is_hip(): + # The MoE kernels are not yet supported on ROCm. + routing_weights = torch.softmax(gating_output, + dim=-1, + dtype=torch.float32) + topk_weights, topk_ids = torch.topk(routing_weights, topk, dim=-1) + else: + import vllm._moe_C as moe_kernels + + topk_weights = torch.empty(M, + topk, + dtype=torch.float32, + device=hidden_states.device) + topk_ids = torch.empty(M, + topk, + dtype=torch.int32, + device=hidden_states.device) + token_expert_indicies = torch.empty(M, + topk, + dtype=torch.int32, + device=hidden_states.device) + moe_kernels.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + gating_output.float(), # TODO(woosuk): Optimize this. + ) + del token_expert_indicies # Not used. Will be used in the future. 
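# For reference: the (topk_weights, topk_ids) pair produced by the fused kernel above
# matches (up to floating-point rounding and tie-breaking between equal gating
# probabilities) the pure-PyTorch routing used on the ROCm branch above:
#   probs = torch.softmax(gating_output, dim=-1, dtype=torch.float32)
#   topk_weights, topk_ids = torch.topk(probs, topk, dim=-1)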
+ if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + config = { 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index fc727b8e661b3..6dba952736921 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -25,7 +25,6 @@ import torch from torch import nn -import torch.nn.functional as F from transformers import PretrainedConfig from vllm.model_executor.input_metadata import InputMetadata @@ -155,20 +154,12 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: shared_output = self.shared_experts(hidden_states) # router_logits: (batch * sequence_length, n_experts) router_logits, _ = self.gate(hidden_states) - - routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) - routing_weights, selected_experts = torch.topk(routing_weights, - self.top_k, - dim=-1) - - if self.config.norm_topk_prob: - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - final_hidden_states = fused_moe(hidden_states, self.w1, self.w2, - routing_weights, - selected_experts, + router_logits, + self.top_k, + renormalize=self.config.norm_topk_prob, inplace=True) if self.config.n_shared_experts is not None: diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a8e470395b904..aeb9d087e954a 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -24,8 +24,6 @@ from typing import List, Optional, Tuple import torch -import torch.nn.functional as F - from torch import nn from transformers import MixtralConfig @@ -128,18 +126,12 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = hidden_states.view(-1, self.hidden_size) # router_logits: (batch * sequence_length, n_experts) router_logits, _ = self.gate(hidden_states) - - routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) - routing_weights, selected_experts = torch.topk(routing_weights, - self.top_k, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - final_hidden_states = fused_moe(hidden_states, self.ws, self.w2s, - routing_weights, - selected_experts, + router_logits, + self.top_k, + renormalize=True, inplace=True) if self.tp_size > 1: From ed70c70ea3569670499717f06d117ed25ec32af4 Mon Sep 17 00:00:00 2001 From: liuyhwangyh Date: Wed, 7 Feb 2024 01:57:15 +0800 Subject: [PATCH 045/112] modelscope: fix issue when model parameter is not a model id but path of the model. (#2489) --- vllm/config.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 1dfc0d63c8813..c35b6302b2cfa 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -93,9 +93,12 @@ def __init__( # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. 
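# (Context for the change below: snapshot_download expects a ModelScope model id, so
# when `model` already points at a local directory the download step is skipped and
# the path is reused as-is.)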
from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C - model_path = snapshot_download(model_id=model, - cache_dir=download_dir, - revision=revision) + if not os.path.exists(model): + model_path = snapshot_download(model_id=model, + cache_dir=download_dir, + revision=revision) + else: + model_path = model self.model = model_path self.download_dir = model_path self.tokenizer = model_path From fe6d09ae61f2281417e35f53a948b6fa898a4eba Mon Sep 17 00:00:00 2001 From: Lily Liu Date: Tue, 6 Feb 2024 11:38:38 -0800 Subject: [PATCH 046/112] [Minor] More fix of test_cache.py CI test failure (#2750) --- tests/kernels/test_cache.py | 9 ++++----- vllm/utils.py | 18 ++++++++++++------ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index e0368d926d51a..d8dc74bc7b003 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -181,16 +181,15 @@ def test_swap_blocks( num_blocks: int, dtype: torch.dtype, seed: int, - device: int, + device: str, ) -> None: random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - src_device = f"{direction[0]}:{device}" if direction[ - 0] == "cuda" else direction[0] - dst_device = f"{direction[1]}:{device}" if direction[ - 1] == "cuda" else direction[1] + + src_device = device if direction[0] == "cuda" else 'cpu' + dst_device = device if direction[1] == "cuda" else 'cpu' src_blocks = random.sample(range(num_blocks), num_mappings) # For the same device, mapping must not overlap diff --git a/vllm/utils.py b/vllm/utils.py index 9e9126a2d6377..d7a3a3a2a9ef9 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -258,10 +258,13 @@ def create_kv_caches_with_random( key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, device=device) - if cache_dtype in ["auto", "half", "bfloat16", "float"]: - key_cache.uniform_(-scale, scale) - elif cache_dtype == 'fp8_e5m2': + if cache_dtype == 'fp8_e5m2': _generate_random_fp8_e5m2(key_cache, -scale, scale) + elif torch_dtype in [torch.half, torch.bfloat16, torch.float]: + key_cache.uniform_(-scale, scale) + else: + raise ValueError( + f"Does not support key cache of type {cache_dtype}") key_caches.append(key_cache) value_cache_shape = (num_blocks, num_heads, head_size, block_size) @@ -270,9 +273,12 @@ def create_kv_caches_with_random( value_cache = torch.empty(size=value_cache_shape, dtype=torch_dtype, device=device) - if cache_dtype in ["auto", "half", "bfloat16", "float"]: - value_cache.uniform_(-scale, scale) - elif cache_dtype == 'fp8_e5m2': + if cache_dtype == 'fp8_e5m2': _generate_random_fp8_e5m2(value_cache, -scale, scale) + elif torch_dtype in [torch.half, torch.bfloat16, torch.float]: + value_cache.uniform_(-scale, scale) + else: + raise ValueError( + f"Does not support value cache of type {cache_dtype}") value_caches.append(value_cache) return key_caches, value_caches From c81dddb45c71e630b907f9d84686ecd73b4105c7 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Wed, 7 Feb 2024 01:36:59 -0500 Subject: [PATCH 047/112] [ROCm] Fix build problem resulted from previous commit related to FP8 kv-cache support (#2790) --- Dockerfile.rocm | 1 + rocm_patch/rocm_bf16.patch | 15 +++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 rocm_patch/rocm_bf16.patch diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 3c76305303037..f49b321372ed0 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -76,6 +76,7 @@ RUN 
cd /app \ && cd vllm \ && pip install -U -r requirements-rocm.txt \ && bash patch_xformers.rocm.sh \ + && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \ && python3 setup.py install \ && cd .. diff --git a/rocm_patch/rocm_bf16.patch b/rocm_patch/rocm_bf16.patch new file mode 100644 index 0000000000000..a0f07da2a3e2b --- /dev/null +++ b/rocm_patch/rocm_bf16.patch @@ -0,0 +1,15 @@ +--- amd_hip_bf16.h 2024-02-06 18:28:58.268699142 +0000 ++++ amd_hip_bf16.h.new 2024-02-06 18:28:31.988647133 +0000 +@@ -90,10 +90,10 @@ + #include "math_fwd.h" // ocml device functions + + #if defined(__HIPCC_RTC__) +-#define __HOST_DEVICE__ __device__ ++#define __HOST_DEVICE__ __device__ static + #else + #include +-#define __HOST_DEVICE__ __host__ __device__ ++#define __HOST_DEVICE__ __host__ __device__ static inline + #endif + + // Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on From 931746bc6d7c1c0ab40b2c4f58b51b855f0b2c94 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 7 Feb 2024 14:42:02 -0800 Subject: [PATCH 048/112] Add documentation on how to do incremental builds (#2796) --- docs/source/getting_started/installation.rst | 10 ++++++++++ setup.py | 5 +++++ 2 files changed, 15 insertions(+) diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index 911c3d8f9a4ab..77b0ae65838a8 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -67,3 +67,13 @@ You can also build and install vLLM from source: $ # Use `--ipc=host` to make sure the shared memory is large enough. $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 + +.. note:: + If you are developing the C++ backend of vLLM, consider building vLLM with + + .. code-block:: console + + $ python setup.py develop + + since it will give you incremental builds. The downside is that this method + is `deprecated by setuptools `_. diff --git a/setup.py b/setup.py index 9cc4aea0ea75a..60efed0720ff1 100644 --- a/setup.py +++ b/setup.py @@ -15,6 +15,11 @@ ROOT_DIR = os.path.dirname(__file__) +# If you are developing the C++ backend of vLLM, consider building vLLM with +# `python setup.py develop` since it will give you incremental builds. +# The downside is that this method is deprecated, see +# https://github.com/pypa/setuptools/issues/917 + MAIN_CUDA_VERSION = "12.1" # Supported NVIDIA GPU architectures. From 65b89d16ee54063d0737e3da1b15e2177916118d Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Fri, 9 Feb 2024 02:57:25 +0900 Subject: [PATCH 049/112] [Ray] Integration compiled DAG off by default (#2471) --- vllm/engine/llm_engine.py | 62 ++++++++++++++++++++++++++++++++++----- vllm/engine/ray_utils.py | 18 ++++++++++++ 2 files changed, 73 insertions(+), 7 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 02c673c96fd9a..03a2b1157652b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -2,6 +2,7 @@ from collections import defaultdict import os import time +import pickle from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union) @@ -30,6 +31,11 @@ logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 +# If the env var is set, it uses the Ray's compiled DAG API +# which optimizes the control plane overhead. +# Run VLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. 
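# (Note: os.getenv returns a string, so any non-empty value -- even "0" -- makes this
# flag truthy; leave the variable unset to keep the compiled-DAG path disabled.)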
+USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) + class LLMEngine: """An LLM engine that receives requests and generates texts. @@ -124,6 +130,10 @@ def __init__( self.stat_logger = StatLogger( local_interval=_LOCAL_LOGGING_INTERVAL_SEC) + self.forward_dag = None + if USE_RAY_COMPILED_DAG: + self.forward_dag = self._compiled_ray_dag() + def get_tokenizer_for_seq(self, sequence: Sequence): return self.tokenizer.get_lora_tokenizer(sequence.lora_request) @@ -806,7 +816,8 @@ def step(self) -> List[RequestOutput]: "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in, "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out, "blocks_to_copy": scheduler_outputs.blocks_to_copy, - }) + }, + use_ray_compiled_dag=USE_RAY_COMPILED_DAG) # Only the driver worker returns the sampling results. output = all_outputs[0] @@ -966,6 +977,7 @@ def _run_workers( driver_args: Optional[List[Any]] = None, driver_kwargs: Optional[Dict[str, Any]] = None, max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, **kwargs, ) -> Any: """Runs the given method on all workers.""" @@ -974,11 +986,16 @@ def _run_workers( raise NotImplementedError( "max_concurrent_workers is not supported yet.") - # Start the ray workers first. - ray_worker_outputs = [ - worker.execute_method.remote(method, *args, **kwargs) - for worker in self.workers - ] + if use_ray_compiled_dag: + # Right now, compiled DAG can only accept a single + # input. TODO(sang): Fix it. + output_channels = self.forward_dag.execute(1) + else: + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *args, **kwargs) + for worker in self.workers + ] if driver_args is None: driver_args = args @@ -991,6 +1008,37 @@ def _run_workers( # Get the results of the ray workers. if self.workers: - ray_worker_outputs = ray.get(ray_worker_outputs) + if use_ray_compiled_dag: + try: + ray_worker_outputs = [ + pickle.loads(chan.begin_read()) + for chan in output_channels + ] + finally: + # Has to call end_read in order to reuse the DAG. + for chan in output_channels: + chan.end_read() + else: + ray_worker_outputs = ray.get(ray_worker_outputs) return [driver_worker_output] + ray_worker_outputs + + def _compiled_ray_dag(self): + import pkg_resources + required_version = "2.9" + current_version = pkg_resources.get_distribution("ray").version + if current_version < required_version: + raise ValueError(f"Ray version {required_version} or greater is " + f"required, but found {current_version}") + + from ray.dag import MultiOutputNode, InputNode + assert self.parallel_config.worker_use_ray + + # Right now, compiled DAG requires at least 1 arg. We send + # a dummy value for now. It will be fixed soon. + with InputNode() as input_data: + forward_dag = MultiOutputNode([ + worker.execute_model_compiled_dag_remote.bind(input_data) + for worker in self.workers + ]) + return forward_dag.experimental_compile() diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index afbc33ed19a0c..bbcbbdfea2f00 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -1,3 +1,5 @@ +import pickle + from typing import Optional, List, Tuple, TYPE_CHECKING from vllm.config import ParallelConfig @@ -18,6 +20,11 @@ def __init__(self, init_cached_hf_modules=False) -> None: from transformers.dynamic_module_utils import init_hf_modules init_hf_modules() self.worker = None + # Since the compiled DAG runs a main execution + # in a different thread that calls cuda.set_device. 
+ # The flag indicates is set_device is called on + # that thread. + self.compiled_dag_cuda_device_set = False def init_worker(self, worker_init_fn): self.worker = worker_init_fn() @@ -40,6 +47,17 @@ def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: def set_cuda_visible_devices(self, device_ids) -> None: set_cuda_visible_devices(device_ids) + def execute_model_compiled_dag_remote(self, ignored): + """Used only when compiled DAG is enabled.""" + import torch + if not self.compiled_dag_cuda_device_set: + torch.cuda.set_device(self.worker.device) + self.compiled_dag_cuda_device_set = True + + output = self.worker.execute_model() + output = pickle.dumps(output) + return output + except ImportError as e: logger.warning(f"Failed to import Ray with {e!r}. " "For distributed inference, please install Ray with " From 3711811b1d2956e83e626c72f0e1607f2dfbc8fb Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 8 Feb 2024 09:58:03 -0800 Subject: [PATCH 050/112] Disable custom all reduce by default (#2808) --- vllm/config.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index c35b6302b2cfa..27c61d4d50439 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -388,16 +388,26 @@ def _verify_args(self) -> None: if self.pipeline_parallel_size > 1: raise NotImplementedError( "Pipeline parallelism is not supported yet.") - if is_hip(): + if not self.disable_custom_all_reduce and self.world_size > 1: + if is_hip(): + self.disable_custom_all_reduce = True + logger.info( + "Disabled the custom all-reduce kernel because it is not " + "supported on AMD GPUs.") + elif self.pipeline_parallel_size > 1: + self.disable_custom_all_reduce = True + logger.info( + "Disabled the custom all-reduce kernel because it is not " + "supported with pipeline parallelism.") + + # FIXME(woosuk): Fix the stability issues and re-enable the custom + # all-reduce kernel. + if not self.disable_custom_all_reduce and self.world_size > 1: self.disable_custom_all_reduce = True logger.info( - "Disabled the custom all-reduce kernel because it is not " - "supported on AMD GPUs.") - elif self.pipeline_parallel_size > 1: - self.disable_custom_all_reduce = True - logger.info( - "Disabled the custom all-reduce kernel because it is not " - "supported with pipeline parallelism.") + "Custom all-reduce kernels are temporarily disabled due to " + "stability issues. 
We will re-enable them once the issues are " + "resolved.") class SchedulerConfig: From 0580aab02ffe60fee50bddc80b787828eb233c44 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Sun, 11 Feb 2024 02:14:37 -0500 Subject: [PATCH 051/112] =?UTF-8?q?[ROCm]=20support=20Radeon=E2=84=A2=2079?= =?UTF-8?q?00=20series=20(gfx1100)=20without=20using=20flash-attention=20(?= =?UTF-8?q?#2768)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile.rocm | 15 +++++-- .../getting_started/amd-installation.rst | 3 +- setup.py | 2 +- vllm/model_executor/layers/attention.py | 45 +++++++++++++++++++ 4 files changed, 60 insertions(+), 5 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index f49b321372ed0..e0ef4a0f4131a 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -17,6 +17,12 @@ RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" ARG FA_BRANCH="3d2b6f5" RUN echo "FA_BRANCH is $FA_BRANCH" +# whether to build flash-attention +# if 0, will not build flash attention +# this is useful for gfx target where flash-attention is not supported +# In that case, we need to use the python reference attention implementation in vllm +ARG BUILD_FA="1" + # Install some basic utilities RUN apt-get update && apt-get install python3 python3-pip -y @@ -50,7 +56,8 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: # Install ROCm flash-attention -RUN mkdir libs \ +RUN if [ "$BUILD_FA" == "1" ]; then \ + mkdir libs \ && cd libs \ && git clone https://github.com/ROCmSoftwarePlatform/flash-attention.git \ && cd flash-attention \ @@ -60,7 +67,8 @@ RUN mkdir libs \ && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \ patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ && python3 setup.py install \ - && cd .. + && cd ..; \ + fi COPY ./ /app/vllm @@ -75,7 +83,8 @@ RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" RUN cd /app \ && cd vllm \ && pip install -U -r requirements-rocm.txt \ - && bash patch_xformers.rocm.sh \ + && if [ "$BUILD_FA" == "1" ]; then \ + bash patch_xformers.rocm.sh; fi \ && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \ && python3 setup.py install \ && cd .. diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 6851ba136351c..5d9fdf4056709 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -12,7 +12,7 @@ Requirements * OS: Linux * Python: 3.8 -- 3.11 -* GPU: MI200s (gfx90a), MI300 (gfx942) +* GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) * Pytorch 2.0.1/2.1.1/2.2 * ROCm 5.7 (Verified on python 3.10) or ROCm 6.0 (Verified on python 3.9) @@ -105,6 +105,7 @@ The `Dokerfile.rocm` is designed to support both ROCm 5.7 and ROCm 6.0 and later * `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1` * `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. 
The default is `gfx90a;gfx942` * `FA_BRANCH`: specifies the branch used to build the flash-attention in `ROCmSoftwarePlatform's flash-attention repo `_. The default is `3d2b6f5` +* `BUILD_FA`: specifies whether to build flash-attention. For `Radeon RX 7900 series (gfx1100) `_, this should be set to 0 before flash-attention supports this target. Their values can be passed in when running ``docker build`` with ``--build-arg`` options. diff --git a/setup.py b/setup.py index 60efed0720ff1..ea58a1a49e7e3 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ # Supported NVIDIA GPU architectures. NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx942"} +ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx942", "gfx1100"} # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 2ce9d60f08d80..0622a54db1bc0 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -1,6 +1,7 @@ """Multi-head attention.""" from typing import List, Optional +import importlib import torch import torch.nn as nn from xformers import ops as xops @@ -58,6 +59,40 @@ def __init__( raise ValueError(f"head_size ({self.head_size}) is not supported. " f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.") + self.use_ref_attention = self.check_use_ref_attention() + + def check_use_ref_attention(self) -> bool: + if not is_hip(): + return False + # For ROCm, check whether flash attention is installed or not. + # if not, use_ref_attention needs to be True + return importlib.util.find_spec("flash_attn") is None + + def ref_masked_attention( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + ) -> torch.Tensor: + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + seq_len, _, _ = query.shape + attn_mask = torch.triu(torch.ones(seq_len, + seq_len, + dtype=query.dtype, + device=query.device), + diagonal=1) + attn_mask = attn_mask * torch.finfo(query.dtype).min + + attn_weights = self.scale * torch.einsum("qhd,khd->hqk", query, + key).float() + attn_weights = attn_weights + attn_mask.float() + attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) + out = torch.einsum("hqk,khd->qhd", attn_weights, value) + return out + def forward( self, query: torch.Tensor, @@ -137,6 +172,16 @@ def forward( self.alibi_slopes, self.num_kv_heads, batch_size, seq_len, query.dtype) + if self.use_ref_attention: + output = self.ref_masked_attention( + query, + key, + value, + ) + # Using view got RuntimeError: view size is not compatible with input tensor's size and stride + # (at least one dimension spans across two contiguous subspaces). Use reshape instead + return output.reshape(batch_size, seq_len, hidden_size) + # TODO(woosuk): Too many view operations. Let's try to reduce # them in the future for code readability. 
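# (The ref_masked_attention path above materializes a full seq_len x seq_len score
# matrix per head, so it is quadratic in sequence length; it is intended only as a
# fallback when ROCm flash-attention is unavailable, e.g. on gfx1100.)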
if self.alibi_slopes is None: From 4ca2c358b178d5e026db925a1ed9f8945010a98f Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Mon, 12 Feb 2024 08:24:45 -0800 Subject: [PATCH 052/112] Add documentation section about LoRA (#2834) --- docs/source/index.rst | 1 + docs/source/models/lora.rst | 52 +++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 docs/source/models/lora.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 3e2331907f0f2..9b53a643b8d46 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -82,6 +82,7 @@ Documentation models/supported_models models/adding_model models/engine_args + models/lora .. toctree:: :maxdepth: 1 diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst new file mode 100644 index 0000000000000..b773edfc6ff2b --- /dev/null +++ b/docs/source/models/lora.rst @@ -0,0 +1,52 @@ +.. _lora: + +Using LoRA adapters +=================== + +This document shows you how to use `LoRA adapters `_ with vLLM on top of a base model. +Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save +them locally with + +.. code-block:: python + + from huggingface_hub import snapshot_download + + sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") + + +Then we instantiate the base model and pass in the ``enable_lora=True`` flag: + +.. code-block:: python + + from vllm import LLM, SamplingParams + from vllm.lora.request import LoRARequest + + llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True) + + +We can now submit the prompts and call ``llm.generate`` with the ``lora_request`` parameter. The first parameter +of ``LoRARequest`` is a human identifiable name, the second parameter is a globally unique ID for the adapter and +the third parameter is the path to the LoRA adapter. + +.. code-block:: python + + sampling_params = SamplingParams( + temperature=0, + max_tokens=256, + stop=["[/assistant]"] + ) + + prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", + ] + + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) + ) + + +Check out `examples/multilora_inference.py `_ +for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. 
\ No newline at end of file From 563836496abc0914c212b693130f80be25926564 Mon Sep 17 00:00:00 2001 From: Rex Date: Mon, 12 Feb 2024 11:02:17 -0800 Subject: [PATCH 053/112] Refactor 2 awq gemm kernels into m16nXk32 (#2723) Co-authored-by: Chunan Zeng --- csrc/quantization/awq/gemm_kernels.cu | 366 ++++-------------- .../model_executor/layers/quantization/awq.py | 2 +- 2 files changed, 73 insertions(+), 295 deletions(-) diff --git a/csrc/quantization/awq/gemm_kernels.cu b/csrc/quantization/awq/gemm_kernels.cu index 376c8ebfb9b7a..5aefb0bd16aef 100644 --- a/csrc/quantization/awq/gemm_kernels.cu +++ b/csrc/quantization/awq/gemm_kernels.cu @@ -27,72 +27,85 @@ __pack_half2(const half x, const half y) { return (v1 << 16) | v0; } -__global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, int split_k_iters, half* __restrict__ A, int* __restrict__ B, half* __restrict__ scaling_factors, int* __restrict__ zeros, int M, int IC, int OC, half* __restrict__ C) +template +__global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16nXk32( + int G, + int split_k_iters, + half* __restrict__ A, + int* __restrict__ B, + half* __restrict__ scaling_factors, + int* __restrict__ zeros, + int M, + int IC, + int OC, + half* __restrict__ C) { + // Only support matrix n = 64 or 128 + assert(N == 64 || N == 128); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 assert(false); #else static constexpr uint32_t ZERO = 0x0; float C_warp[32]; __shared__ half A_shared[16 * (32 + 8)]; - __shared__ half B_shared[32 * (128 + 8)]; - - __shared__ half scaling_factors_shared[128]; - __shared__ half zeros_shared[128]; + __shared__ half B_shared[32 * (N + 8)]; - int j_factors1 = ((OC + 128 - 1) / 128); + __shared__ half scaling_factors_shared[N]; + __shared__ half zeros_shared[N]; + + int j_factors1 = ((OC + N - 1) / N); int blockIdx_x = 0; int blockIdx_y = blockIdx.x % ((M + 16 - 1) / 16 * j_factors1); int blockIdx_z = blockIdx.x / ((M + 16 - 1) / 16 * j_factors1); half A_shared_warp[8]; - half B_shared_warp[32]; - for (int j_0_4_init = 0; j_0_4_init < 4; ++j_0_4_init) { + half B_shared_warp[N / 4]; + for (int j_0_4_init = 0; j_0_4_init < N / 32; ++j_0_4_init) { for (int i = 0; i < 8; ++i) { C_warp[(j_0_4_init * 8) + i] = 0.0; } } static constexpr int row_stride_warp = 32 * 8 / 32; - static constexpr int row_stride = 2 * 32 * 8 / 128; - bool ld_zero_flag = (threadIdx.y * 32 + threadIdx.x) * 8 < 128; + static constexpr int row_stride = 2 * 32 * 8 / N; + bool ld_zero_flag = (threadIdx.y * 32 + threadIdx.x) * 8 < N; // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16 bool ld_A_flag = (blockIdx_y / j_factors1 * 16 + threadIdx.y * row_stride_warp + threadIdx.x * 8 / 32) < M; // threadIdx.y is warp_id // bool wb_C_flag = (threadIdx.x / 4) < M; - half* A_ptr = A + half* A_ptr = A + (((int)blockIdx_y) / j_factors1 * 16 + (((int)threadIdx.y) * row_stride_warp) + ((int)threadIdx.x) / (32 / 8)) * IC + (((int)threadIdx.x) % (32 / 8)) * 8; - + int* B_ptr = B - + ((int)threadIdx.y) * (OC / 8) * 2 - + (((int)threadIdx.x) / (128 / 8)) * (OC / 8) - + (((int)blockIdx_y) % j_factors1) * (128 / 8) - + (((int)threadIdx.x) % (128 / 8)) * 1; + + ((int)threadIdx.y) * (OC / 8) * (256 / N) + + (((int)threadIdx.x) / (N / 8)) * (OC / 8) + + (((int)blockIdx_y) % j_factors1) * (N / 8) + + (((int)threadIdx.x) % (N / 8)) * 1; // Why * 1 in the above line? 
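// (Likely because the last term walks across adjacent packed int32 weight columns,
// one per thread, so its stride is 1; the "* 1" appears to be spelled out only to
// mirror the other stride terms.)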
- - half* A_shared_ptr = A_shared - + ((int)threadIdx.y) * row_stride_warp * (32 + 8) + + half* A_shared_ptr = A_shared + + ((int)threadIdx.y) * row_stride_warp * (32 + 8) + (((int)threadIdx.x) / (32 / 8)) * (32 + 8) + (((int)threadIdx.x) % (32 / 8) ) * 8; half* B_shared_ptr = B_shared - + ((int)threadIdx.y) * (row_stride / 2) * (128 + 8) - + (((int)threadIdx.x) / (128 / 8)) * (128 + 8) - + (((int)threadIdx.x) % (128 / 8)) * 8; - + + ((int)threadIdx.y) * (row_stride / 2) * (N + 8) + + (((int)threadIdx.x) / (N / 8)) * (N + 8) + + (((int)threadIdx.x) % (N / 8)) * 8; + int* zeros_ptr = zeros - + (((int)blockIdx_y) % j_factors1) * (128 / 8) - + ((int)threadIdx.x) % (128 / 8); - + + (((int)blockIdx_y) % j_factors1) * (N / 8) + + ((int)threadIdx.x) % (N / 8); + half* scaling_factors_ptr = scaling_factors - + (((int)blockIdx_y) % j_factors1) * (128) - + (((int)threadIdx.x) % (128 / 8)) * 8; + + (((int)blockIdx_y) % j_factors1) * N + + (((int)threadIdx.x) % (N / 8)) * 8; - half* C_ptr = C + half* C_ptr = C + static_cast(blockIdx_z) * M * OC // blockIdz.x -> split_k dim - + (((int)blockIdx_y) % j_factors1) * 128 - + ((int)threadIdx.y) * 64 + + (((int)blockIdx_y) % j_factors1) * N + + ((int)threadIdx.y) * (N / 2) + (((int)threadIdx.x) % 4) * 2; // preload s.f. and zeros @@ -123,13 +136,13 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i // uint4 B_loaded_scale = make_uint4(0, 0, 0, 0); int* B_ptr_local = B_ptr + k_0_0 * 32 * (OC / 8); - for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 8; ++ax0_ax1_fused_0) { + for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < N / 16; ++ax0_ax1_fused_0) { // B: 32 x 136 (128+8) float16 // each warp: 32 x 4 // each thr: read 32 bit -> convert to 8xFP16 (a UINT4) -> scale and minus zero -> WB UINT4 // *(uint4*)(B_shared + ((((ax0_ax1_fused_0 * 544) + (((int)threadIdx.y) * 272)) + ((((int)threadIdx.x) >> 4) * 136)) + ((((int)threadIdx.x) & 15) * 8))) = *(uint4*)(B + ((((((k_0_0 * 163840) + (ax0_ax1_fused_0 * 20480)) + (((int)threadIdx.y) * 10240)) + ((((int)threadIdx.x) >> 4) * 5120)) + (((int)blockIdx_y) * 128)) + ((((int)threadIdx.x) & 15) * 8))); - // row stride in shared memory: (NWARPS * 32 * 8 / cta_N) + // row stride in shared memory: (NWARPS * 32 * 8 / cta_N) uint32_t B_loaded = *(uint32_t*)(B_ptr_local + ax0_ax1_fused_0 * row_stride * (OC / 8)); uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); //uint4 B_loaded_zero = *(uint4*)(zeros_shared + (threadIdx.x % (cta_N / 8)) * 8); @@ -152,7 +165,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i */ // write back - *(uint4*)(B_shared_ptr + ax0_ax1_fused_0 * row_stride * (128 + 8)) = B_loaded_fp16; + *(uint4*)(B_shared_ptr + ax0_ax1_fused_0 * row_stride * (N + 8)) = B_loaded_fp16; } __syncthreads(); @@ -174,13 +187,13 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i ); } - for (int ax1_0 = 0; ax1_0 < 4; ++ax1_0) { + for (int ax1_0 = 0; ax1_0 < N / 32; ++ax1_0) { { unsigned int addr; __asm__ __volatile__( "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" : "=r"(addr) - : "l"((void *)((&(B_shared[(((k_0_1 * 2176) + (((int)threadIdx.y) * 64)) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * 136) + ((((int)threadIdx.x) >> 4) * 8)))) + : "l"((void *)((&(B_shared[(((k_0_1 * (N * 16 + 128)) + (((int)threadIdx.y) * (N / 2))) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * (N + 8)) + ((((int)threadIdx.x) >> 4) * 8)))) ); __asm__ __volatile__( "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" 
@@ -190,7 +203,7 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i ); } } - for (int j_0_4 = 0; j_0_4 < 4; ++j_0_4) { + for (int j_0_4 = 0; j_0_4 < N / 32; ++j_0_4) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 { __asm__ __volatile__( @@ -258,241 +271,6 @@ __global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, i #endif } - -__global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, int split_k_iters, half* __restrict__ A, int* __restrict__ B, half* __restrict__ scaling_factors, int* __restrict__ zeros, int M, int IC, int OC, half* __restrict__ C) -{ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 - assert(false); -#else - static constexpr uint32_t ZERO = 0x0; - float C_warp[32]; - __shared__ half A_shared[16 * (32 + 8)]; - __shared__ half B_shared[32 * (64 + 8)]; - - __shared__ half scaling_factors_shared[64]; - __shared__ half zeros_shared[64]; - - int j_factors1 = ((OC + 64 - 1) / 64); - - int blockIdx_x = 0; - int blockIdx_y = blockIdx.x % ((M + 16 - 1) / 16 * j_factors1); - int blockIdx_z = blockIdx.x / ((M + 16 - 1) / 16 * j_factors1); - - half A_shared_warp[8]; - half B_shared_warp[16]; - for (int j_0_4_init = 0; j_0_4_init < 2; ++j_0_4_init) { - for (int i = 0; i < 8; ++i) { - C_warp[(j_0_4_init * 8) + i] = 0.0; - } - } - - static constexpr int row_stride_warp = 32 * 8 / 32; - static constexpr int row_stride = 2 * 32 * 8 / 64; - bool ld_zero_flag = (threadIdx.y * 32 + threadIdx.x) * 8 < 64; - // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16 - bool ld_A_flag = (blockIdx_y / j_factors1 * 16 + threadIdx.y * row_stride_warp + threadIdx.x * 8 / 32) < M; // threadIdx.y is warp_id - // bool wb_C_flag = (threadIdx.x / 4) < M; - - half* A_ptr = A - + (((int)blockIdx_y) / j_factors1 * 16 + (((int)threadIdx.y) * row_stride_warp) + ((int)threadIdx.x) / (32 / 8)) * IC - + (((int)threadIdx.x) % (32 / 8)) * 8; - - int* B_ptr = B - + ((int)threadIdx.y) * (OC / 8) * 4 - + (((int)threadIdx.x) / (64 / 8)) * (OC / 8) - + (((int)blockIdx_y) % j_factors1) * (64 / 8) - + (((int)threadIdx.x) % (64 / 8)) * 1; -// Why * 1 in the above line? - - half* A_shared_ptr = A_shared - + ((int)threadIdx.y) * row_stride_warp * (32 + 8) - + (((int)threadIdx.x) / (32 / 8)) * (32 + 8) - + (((int)threadIdx.x) % (32 / 8) ) * 8; - - half* B_shared_ptr = B_shared - + ((int)threadIdx.y) * (row_stride / 2) * (64 + 8) - + (((int)threadIdx.x) / (64 / 8)) * (64 + 8) - + (((int)threadIdx.x) % (64 / 8)) * 8; - - int* zeros_ptr = zeros - + (((int)blockIdx_y) % j_factors1) * (64 / 8) - + ((int)threadIdx.x) % (64 / 8); - - half* scaling_factors_ptr = scaling_factors - + (((int)blockIdx_y) % j_factors1) * (64) - + (((int)threadIdx.x) % (64 / 8)) * 8; - - half* C_ptr = C - + static_cast(blockIdx_z) * M * OC // blockIdz.x -> split_k dim - + (((int)blockIdx_y) % j_factors1) * 64 - + ((int)threadIdx.y) * 32 - + (((int)threadIdx.x) % 4) * 2; - - // preload s.f. 
and zeros - int k_bound = (IC / 32 + split_k_iters - 1) / split_k_iters; - if ((k_bound - 1) * split_k_iters * 32 + blockIdx_z * 32 >= IC) k_bound -= 1; - for (int _k_0_0 = 0; _k_0_0 < k_bound; ++_k_0_0) { - int k_0_0 = _k_0_0 * split_k_iters + blockIdx_z; - __syncthreads(); - // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16 - if (ld_A_flag) - { - *(uint4*)(A_shared_ptr) = *(uint4*)(A_ptr + (k_0_0 * 32)); - } - else - { - *(uint4*)(A_shared_ptr) = make_uint4(0, 0, 0, 0); - } - - // for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 2; ++ax0_ax1_fused_0) { - uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr + k_0_0 * 32 / G * (OC / 8)); - uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded); - uint4 B_loaded_scale = *(uint4*)(scaling_factors_ptr + k_0_0 * 32 / G * (OC)); - /* - if (blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 0 && threadIdx.y == 0){ - printf("%x %x %x %x %x %x %x %x\n", B_loaded_scale.x, B_loaded_scale.y, B_loaded_scale.z, B_loaded_scale.w, B_loaded_zero.x, B_loaded_zero.y, B_loaded_zero.z, B_loaded_zero.w); - } - */ - // uint4 B_loaded_scale = make_uint4(0, 0, 0, 0); - int* B_ptr_local = B_ptr + k_0_0 * 32 * (OC / 8); - - for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 4; ++ax0_ax1_fused_0) { - - // B: 32 x 136 (128+8) float16 - // each warp: 32 x 4 - // each thr: read 32 bit -> convert to 8xFP16 (a UINT4) -> scale and minus zero -> WB UINT4 - // *(uint4*)(B_shared + ((((ax0_ax1_fused_0 * 544) + (((int)threadIdx.y) * 272)) + ((((int)threadIdx.x) >> 4) * 136)) + ((((int)threadIdx.x) & 15) * 8))) = *(uint4*)(B + ((((((k_0_0 * 163840) + (ax0_ax1_fused_0 * 20480)) + (((int)threadIdx.y) * 10240)) + ((((int)threadIdx.x) >> 4) * 5120)) + (((int)blockIdx_y) * 128)) + ((((int)threadIdx.x) & 15) * 8))); - // row stride in shared memory: (NWARPS * 32 * 8 / cta_N) - uint32_t B_loaded = *(uint32_t*)(B_ptr_local + ax0_ax1_fused_0 * row_stride * (OC / 8)); - uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); - //uint4 B_loaded_zero = *(uint4*)(zeros_shared + (threadIdx.x % (cta_N / 8)) * 8); - - // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8); - // - zero and * scale - // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = q * scale - zero * scale. 
- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); - /* - if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){ - printf("[x] %X %X %X %X\n", B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); - } - */ - - // write back - *(uint4*)(B_shared_ptr + ax0_ax1_fused_0 * row_stride * (64 + 8)) = B_loaded_fp16; - } - __syncthreads(); - - for (int k_0_1 = 0; k_0_1 < 2; ++k_0_1) - { - { - unsigned int addr; - __asm__ __volatile__( - "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" - : "=r"(addr) - : "l"((void *)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8)))) - ); - __asm__ __volatile__( - "ldmatrix.sync.aligned.m8n8.x4.shared.b16" - "{%0, %1, %2, %3}, [%4];\n" - : "=r"(((unsigned *)(A_shared_warp + 0))[0]), "=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), "=r"(((unsigned *)(A_shared_warp + 0))[3]) - : "r"(addr) - ); - } - - - for (int ax1_0 = 0; ax1_0 < 2; ++ax1_0) - { - { - unsigned int addr; - __asm__ __volatile__( - "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" - : "=r"(addr) - : "l"((void *)((&(B_shared[(((k_0_1 * 1152) + (((int)threadIdx.y) * 32)) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * 72) + ((((int)threadIdx.x) >> 4) * 8)))) - ); - __asm__ __volatile__( - "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" - "{%0, %1, %2, %3}, [%4];\n" - : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3]) - : "r"(addr) - ); - } - } - - for (int j_0_4 = 0; j_0_4 < 2; ++j_0_4) - { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); - } - - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : 
"=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); - } - - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); - } - - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); - } -#else - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" - : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); - } - - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" - : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); - } -#endif - } 
- } - } - -// TODO: Shang: Hoist loop invariance. - for (int ax1_0_1 = 0; ax1_0_1 < 2; ++ax1_0_1) { - for (int local_id = 0; local_id < 8; ++local_id) { - int row_offset = (((int)blockIdx_y) / j_factors1) * 16 + ((int)threadIdx.x) / 4 + (local_id % 4) / 2 * 8; - if (row_offset < M) - { - *(C_ptr + ax1_0_1 * 16 + row_offset * OC + (local_id / 4) * 8 + local_id % 2) = __float2half(C_warp[(ax1_0_1 * 8) + local_id]); - } - } - } -#endif -} - __global__ void __launch_bounds__(64) dequantize_weights( int* __restrict__ B, half* __restrict__ scaling_factors, @@ -526,26 +304,24 @@ __global__ void __launch_bounds__(64) dequantize_weights( int index4 = 8 * col + (int)(row / G) * N * 8; half* scaling_factors_ptr2 = scaling_factors + index4; + uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr2); + uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded); + uint4 B_loaded_scale = *(uint4*)(scaling_factors_ptr2); - uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr2); - uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded); - uint4 B_loaded_scale = *(uint4*)(scaling_factors_ptr2); -int j=0; + uint32_t B_loaded = *(uint32_t*)B_ptr2; + uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); - uint32_t B_loaded = *(uint32_t*)(B_ptr2 + j); - uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); - - *(uint4*)(B_shared_ptr2 + j) = B_loaded_fp16; + *(uint4*)B_shared_ptr2 = B_loaded_fp16; - for (int i=0; i<8; ++i) { + for (int i = 0; i < 8; ++i) { *(C_ptr2 + i) = B_shared[i]; } } @@ -650,19 +426,21 @@ torch::Tensor awq_gemm( // threadIdx.x: 32 // threadIdx.y: i_factors[2] * j_factors[2] dim3 
threads_per_block(32, 2); - vllm::awq::gemm_forward_4bit_cuda_m16n128k32<<>>( - group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, num_in_feats, num_in_channels, num_out_channels, out_feats); + vllm::awq::gemm_forward_4bit_cuda_m16nXk32<128><<>>( + group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, num_in_feats, num_in_channels, + num_out_channels, out_feats); } else if (num_out_channels % 64 == 0) { int j_factors1 = num_out_channels / 64 / 1; dim3 num_blocks(1 * (num_out_feats + 16 - 1) / 16 * j_factors1 * split_k_iters); - + // threadIdx.x: 32 // threadIdx.y: i_factors[2] * j_factors[2] dim3 threads_per_block(32, 2); - vllm::awq::gemm_forward_4bit_cuda_m16n64k32<<>>( - group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, num_in_feats, num_in_channels, num_out_channels, out_feats); + vllm::awq::gemm_forward_4bit_cuda_m16nXk32<64><<>>( + group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, num_in_feats, num_in_channels, + num_out_channels, out_feats); } return _out_feats.sum(0); } diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 681f95821eabb..3e1c814dd233c 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -145,8 +145,8 @@ def apply_weights(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: qweight = weights["qweight"] - qzeros = weights["qzeros"] scales = weights["scales"] + qzeros = weights["qzeros"] pack_factor = self.quant_config.pack_factor out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, )) reshaped_x = x.reshape(-1, x.shape[-1]) From a4211a4dc3a83d9e58eb7ee2f015aa033159c267 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Mon, 12 Feb 2024 22:53:00 -0800 Subject: [PATCH 054/112] Serving Benchmark Refactoring (#2433) --- .buildkite/run-benchmarks.sh | 14 +- benchmarks/backend_request_func.py | 284 ++++++++++++++++++++++ benchmarks/benchmark_serving.py | 378 ++++++++++++++++++++--------- benchmarks/launch_tgi_server.sh | 2 +- 4 files changed, 553 insertions(+), 125 deletions(-) create mode 100644 benchmarks/backend_request_func.py diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh index dde28cb55605e..865068628f1d0 100644 --- a/.buildkite/run-benchmarks.sh +++ b/.buildkite/run-benchmarks.sh @@ -6,15 +6,16 @@ set -o pipefail # cd into parent directory of this file cd "$(dirname "${BASH_SOURCE[0]}")/.." -(wget && curl) || (apt-get update && apt-get install -y wget curl) +(which wget && which curl) || (apt-get update && apt-get install -y wget curl) -# run benchmarks and upload the result to buildkite +# run python-based benchmarks and upload the result to buildkite python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt bench_latency_exit_code=$? python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt bench_throughput_exit_code=$? +# run server-based benchmarks and upload the result to buildkite python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf & server_pid=$! 
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json @@ -22,11 +23,14 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r # wait for server to start, timeout after 600 seconds timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 python3 benchmarks/benchmark_serving.py \ + --backend openai \ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \ --model meta-llama/Llama-2-7b-chat-hf \ --num-prompts 20 \ --endpoint /v1/completions \ - --tokenizer meta-llama/Llama-2-7b-chat-hf 2>&1 | tee benchmark_serving.txt + --tokenizer meta-llama/Llama-2-7b-chat-hf \ + --save-result \ + 2>&1 | tee benchmark_serving.txt bench_serving_exit_code=$? kill $server_pid @@ -44,7 +48,7 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line echo "### Serving Benchmarks" >> benchmark_results.md sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line echo "" >> benchmark_results.md -tail -n 5 benchmark_serving.txt >> benchmark_results.md # last 5 lines +tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines # upload the results to buildkite /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md @@ -61,3 +65,5 @@ fi if [ $bench_serving_exit_code -ne 0 ]; then exit $bench_serving_exit_code fi + +/workspace/buildkite-agent artifact upload openai-*.json diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py new file mode 100644 index 0000000000000..e7f74e2feaf86 --- /dev/null +++ b/benchmarks/backend_request_func.py @@ -0,0 +1,284 @@ +import json +import os +import time +from dataclasses import dataclass +from typing import Optional + +import aiohttp +from tqdm.asyncio import tqdm + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + + +@dataclass +class RequestFuncInput: + prompt: str + api_url: str + prompt_len: int + output_len: int + model: str + best_of: int = 1 + use_beam_search: bool = False + + +@dataclass +class RequestFuncOutput: + generated_text: str = "" + success: bool = False + latency: float = 0 + ttft: float = 0 + prompt_len: int = 0 + + +async def async_request_tgi( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith("generate_stream") + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert not request_func_input.use_beam_search + params = { + "best_of": request_func_input.best_of, + "max_new_tokens": request_func_input.output_len, + "do_sample": True, + "temperature": 0.01, # TGI does not accept 0.0 temperature. + "top_p": 0.99, # TGI does not accept 1.0 top_p. 
+ } + payload = { + "inputs": request_func_input.prompt, + "parameters": params, + } + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + ttft = 0 + st = time.perf_counter() + try: + async with session.post(url=api_url, json=payload) as response: + if response.status == 200: + async for data in response.content.iter_any(): + if ttft == 0: + ttft = time.perf_counter() - st + output.ttft = ttft + output.latency = time.perf_counter() - st + + body = data.decode("utf-8").lstrip("data:") + output.generated_text = json.loads(body)["generated_text"] + output.success = True + else: + output.success = False + except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError): + output.success = False + + if pbar: + pbar.update(1) + return output + + +async def async_request_vllm( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith("generate") + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + payload = { + "prompt": request_func_input.prompt, + "n": 1, + "best_of": request_func_input.best_of, + "use_beam_search": request_func_input.use_beam_search, + "temperature": 0.0 if request_func_input.use_beam_search else 1.0, + "top_p": 1.0, + "max_tokens": request_func_input.output_len, + "ignore_eos": True, + "stream": True, + } + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + ttft = 0 + st = time.perf_counter() + try: + async with session.post(url=api_url, json=payload) as response: + if response.status == 200: + async for data in response.content.iter_any(): + if ttft == 0: + ttft = time.perf_counter() - st + output.ttft = ttft + output.latency = time.perf_counter() - st + + # When streaming, '\0' is appended to the end of the response. 
+ body = data.decode("utf-8").strip("\0") + output.generated_text = json.loads( + body)["text"][0][len(request_func_input.prompt):] + output.success = True + + else: + output.success = False + except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError): + output.success = False + + if pbar: + pbar.update(1) + return output + + +async def async_request_trt_llm( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith("generate_stream") + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert not request_func_input.use_beam_search + assert request_func_input.best_of == 1 + payload = { + "accumulate_tokens": True, + "text_input": request_func_input.prompt, + "temperature": 0.0, + "top_p": 1.0, + "max_tokens": request_func_input.output_len, + "stream": True, + } + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + ttft = 0 + + st = time.perf_counter() + try: + async with session.post(url=api_url, json=payload) as resp: + if resp.status == 200: + async for data in resp.content.iter_any(): + if ttft == 0: + ttft = time.perf_counter() - st + output.ttft = ttft + output.latency = time.perf_counter() - st + + body = data.decode("utf-8").lstrip("data:") + output.generated_text = json.loads(body)["text_output"] + output.success = True + + else: + output.success = False + except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError): + output.success = False + + if pbar: + pbar.update(1) + return output + + +async def async_request_deepspeed_mii( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert request_func_input.best_of == 1 + assert not request_func_input.use_beam_search + + payload = { + "prompts": request_func_input.prompt, + "max_new_tokens": request_func_input.output_len, + "ignore_eos": True, + "do_sample": True, + "temperature": + 0.01, # deepspeed-mii does not accept 0.0 temperature. + "top_p": 1.0, + } + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, will use 0 as placeholder. 
+ # https://github.com/microsoft/DeepSpeed-MII/pull/311 + output.ttft = 0 + + st = time.perf_counter() + try: + async with session.post(url=request_func_input.api_url, + json=payload) as resp: + if resp.status == 200: + parsed_resp = await resp.json() + output.latency = time.perf_counter() - st + output.generated_text = parsed_resp[0]["generated_text"] + output.success = True + else: + output.success = False + except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError): + output.success = False + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith("v1/completions") + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert not request_func_input.use_beam_search + payload = { + "model": request_func_input.model, + "prompt": request_func_input.prompt, + "temperature": 0.0, + "best_of": request_func_input.best_of, + "max_tokens": request_func_input.output_len, + "stream": True, + } + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0 + st = time.perf_counter() + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + async for chunk in response.content: + if ttft == 0: + ttft = time.perf_counter() - st + output.ttft = ttft + + chunk = chunk.strip() + if not chunk: + continue + + chunk = chunk.decode("utf-8").lstrip("data: ") + if chunk == "[DONE]": + latency = time.perf_counter() - st + else: + body = json.loads(chunk) + generated_text += body["choices"][0]["text"] + + output.generated_text = generated_text + output.success = True + output.latency = latency + else: + output.success = False + except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError): + output.success = False + + if pbar: + pbar.update(1) + return output + + +ASYNC_REQUEST_FUNCS = { + "tgi": async_request_tgi, + "vllm": async_request_vllm, + "deepspeed-mii": async_request_deepspeed_mii, + "openai": async_request_openai_completions, + "tensorrt-llm": async_request_trt_llm, +} diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 1a36d9d6a5deb..cdcfb8582143c 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -20,16 +20,36 @@ import json import random import time +from dataclasses import dataclass +from datetime import datetime from typing import AsyncGenerator, List, Tuple -import aiohttp import numpy as np from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase from vllm.transformers_utils.tokenizer import get_tokenizer -# (prompt len, output len, latency) -REQUEST_LATENCY: List[Tuple[int, int, float]] = [] +from backend_request_func import ( + ASYNC_REQUEST_FUNCS, + RequestFuncInput, + RequestFuncOutput, +) + + +@dataclass +class BenchmarkMetrics: + completed: int + total_input: int + total_output: int + request_throughput: float + input_throughput: float + output_throughput: float + mean_ttft_ms: float + median_ttft_ms: float + p99_ttft_ms: float + mean_tpot_ms: float + median_tpot_ms: float + p99_tpot_ms: float def sample_requests( @@ -46,6 +66,11 @@ def sample_requests( dataset = [(data["conversations"][0]["value"], data["conversations"][1]["value"]) for data in dataset] + # some of these will be 
filtered out, so sample more than we need + sampled_indices = random.sample(range(len(dataset)), + int(num_requests * 1.2)) + dataset = [dataset[i] for i in sampled_indices] + # Tokenize the prompts and completions. prompts = [prompt for prompt, _ in dataset] prompt_token_ids = tokenizer(prompts).input_ids @@ -92,80 +117,125 @@ async def get_request( await asyncio.sleep(interval) -async def send_request(backend: str, model: str, api_url: str, prompt: str, - prompt_len: int, output_len: int, best_of: int, - use_beam_search: bool, pbar: tqdm) -> None: - request_start_time = time.perf_counter() - - headers = {"User-Agent": "Benchmark Client"} - if backend == "vllm": - pload = { - "prompt": prompt, - "n": 1, - "best_of": best_of, - "use_beam_search": use_beam_search, - "temperature": 0.0 if use_beam_search else 1.0, - "top_p": 1.0, - "max_tokens": output_len, - "ignore_eos": True, - "stream": False, - } - if model is not None: - pload["model"] = model - elif backend == "tgi": - assert not use_beam_search - params = { - "best_of": best_of, - "max_new_tokens": output_len, - "do_sample": True, - } - pload = { - "inputs": prompt, - "parameters": params, - } - else: - raise ValueError(f"Unknown backend: {backend}") - - timeout = aiohttp.ClientTimeout(total=3 * 3600) - async with aiohttp.ClientSession(timeout=timeout) as session: - while True: - async with session.post(api_url, headers=headers, - json=pload) as response: - chunks = [] - async for chunk, _ in response.content.iter_chunks(): - chunks.append(chunk) - output = b"".join(chunks).decode("utf-8") - output = json.loads(output) +def calculate_metrics( + input_requests: List[Tuple[str, int, int]], + outputs: List[RequestFuncOutput], + dur_s: float, + tokenizer: PreTrainedTokenizerBase, +) -> BenchmarkMetrics: + total_output = 0 + total_input = 0 + completed = 0 + per_token_latencies = [] + ttfts = [] + for i in range(len(outputs)): + if outputs[i].success: + output_len = len(tokenizer.encode(outputs[i].generated_text)) + total_output += output_len + total_input += input_requests[i][1] + per_token_latencies.append(outputs[i].latency / output_len) + ttfts.append(outputs[i].ttft) + completed += 1 - # Re-send the request if it failed. 
-            if "error" not in output:
-                break

+    metrics = BenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        total_output=total_output,
+        request_throughput=completed / dur_s,
+        input_throughput=total_input / dur_s,
+        output_throughput=total_output / dur_s,
+        mean_ttft_ms=np.mean(ttfts) * 1000,
+        median_ttft_ms=np.median(ttfts) * 1000,
+        p99_ttft_ms=np.percentile(ttfts, 99) * 1000,
+        mean_tpot_ms=np.mean(per_token_latencies) * 1000,
+        median_tpot_ms=np.median(per_token_latencies) * 1000,
+        p99_tpot_ms=np.percentile(per_token_latencies, 99) * 1000,
+    )

-    request_end_time = time.perf_counter()
-    request_latency = request_end_time - request_start_time
-    REQUEST_LATENCY.append((prompt_len, output_len, request_latency))
-    pbar.update(1)
+    return metrics


 async def benchmark(
     backend: str,
-    model: str,
     api_url: str,
+    model_id: str,
+    tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
     best_of: int,
     use_beam_search: bool,
     request_rate: float,
-) -> None:
-    tasks: List[asyncio.Task] = []
-    pbar = tqdm(total=len(input_requests))
+    disable_tqdm: bool,
+):
+    if backend in ASYNC_REQUEST_FUNCS:
+        request_func = ASYNC_REQUEST_FUNCS.get(backend)
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+    print(f"Traffic request rate: {request_rate}")
+
+    benchmark_start_time = time.perf_counter()
+    tasks = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
-        task = asyncio.create_task(
-            send_request(backend, model, api_url, prompt, prompt_len,
-                         output_len, best_of, use_beam_search, pbar))
-        tasks.append(task)
-    await asyncio.gather(*tasks)
-    pbar.close()
+        request_func_input = RequestFuncInput(
+            model=model_id,
+            prompt=prompt,
+            api_url=api_url,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            best_of=best_of,
+            use_beam_search=use_beam_search,
+        )
+        tasks.append(
+            asyncio.create_task(
+                request_func(request_func_input=request_func_input,
+                             pbar=pbar)))
+    outputs = await asyncio.gather(*tasks)
+
+    if not disable_tqdm:
+        pbar.close()
+
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+
+    metrics = calculate_metrics(
+        input_requests=input_requests,
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+    )
+
+    print(f"Successful requests: {metrics.completed}")
+    print(f"Benchmark duration: {benchmark_duration:.2f} s")
+    print(f"Total input tokens: {metrics.total_input}")
+    print(f"Total generated tokens: {metrics.total_output}")
+    print(f"Request throughput: {metrics.request_throughput:.2f} requests/s")
+    print(f"Input token throughput: {metrics.input_throughput:.2f} tokens/s")
+    print(f"Output token throughput: {metrics.output_throughput:.2f} tokens/s")
+    print(f"Mean TTFT: {metrics.mean_ttft_ms:.2f} ms")
+    print(f"Median TTFT: {metrics.median_ttft_ms:.2f} ms")
+    print(f"P99 TTFT: {metrics.p99_ttft_ms:.2f} ms")
+    print(f"Mean TPOT: {metrics.mean_tpot_ms:.2f} ms")
+    print(f"Median TPOT: {metrics.median_tpot_ms:.2f} ms")
+    print(f"P99 TPOT: {metrics.p99_tpot_ms:.2f} ms")
+
+    result = {
+        "duration": benchmark_duration,
+        "completed": metrics.completed,
+        "total_input_tokens": metrics.total_input,
+        "total_output_tokens": metrics.total_output,
+        "request_throughput": metrics.request_throughput,
+        "input_throughput": metrics.input_throughput,
+        "output_throughput": metrics.output_throughput,
+        "mean_ttft_ms": metrics.mean_ttft_ms,
+        "median_ttft_ms": metrics.median_ttft_ms,
+        "p99_ttft_ms": 
metrics.p99_ttft_ms, + "mean_tpot_ms": metrics.mean_tpot_ms, + "median_tpot_ms": metrics.median_tpot_ms, + "p99_tpot_ms": metrics.p99_tpot_ms + } + return result def main(args: argparse.Namespace): @@ -173,77 +243,145 @@ def main(args: argparse.Namespace): random.seed(args.seed) np.random.seed(args.seed) - api_url = f"{args.protocol}://{args.host}:{args.port}{args.endpoint}" - tokenizer = get_tokenizer(args.tokenizer, + backend = args.backend + model_id = args.model + tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + + if args.base_url is not None: + api_url = f"{args.base_url}{args.endpoint}" + else: + api_url = f"http://{args.host}:{args.port}{args.endpoint}" + + tokenizer = get_tokenizer(tokenizer_id, trust_remote_code=args.trust_remote_code) input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer) - benchmark_start_time = time.perf_counter() - asyncio.run( - benchmark(args.backend, args.model, api_url, input_requests, - args.best_of, args.use_beam_search, args.request_rate)) - benchmark_end_time = time.perf_counter() - benchmark_time = benchmark_end_time - benchmark_start_time - print(f"Total time: {benchmark_time:.2f} s") - print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s") - - # Compute the latency statistics. - avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY]) - print(f"Average latency: {avg_latency:.2f} s") - avg_per_token_latency = np.mean([ - latency / (prompt_len + output_len) - for prompt_len, output_len, latency in REQUEST_LATENCY - ]) - print(f"Average latency per token: {avg_per_token_latency:.2f} s") - avg_per_output_token_latency = np.mean( - [latency / output_len for _, output_len, latency in REQUEST_LATENCY]) - print("Average latency per output token: " - f"{avg_per_output_token_latency:.2f} s") + benchmark_result = asyncio.run( + benchmark( + backend=backend, + api_url=api_url, + model_id=model_id, + tokenizer=tokenizer, + input_requests=input_requests, + best_of=args.best_of, + use_beam_search=args.use_beam_search, + request_rate=args.request_rate, + disable_tqdm=args.disable_tqdm, + )) + + # Save config and results to json + if args.save_result: + result_json = {} + + # Setup + current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") + result_json["date"] = current_dt + result_json["backend"] = backend + result_json["version"] = args.version + result_json["model_id"] = model_id + result_json["tokenizer_id"] = tokenizer_id + result_json["best_of"] = args.best_of + result_json["use_beam_search"] = args.use_beam_search + result_json["num_prompts"] = args.num_prompts + + # Traffic + result_json["request_rate"] = ( + args.request_rate if args.request_rate < float("inf") else "inf") + + # Merge with benchmark result + result_json = {**result_json, **benchmark_result} + + # Save to file + base_model_id = model_id.split("/")[-1] + file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" + with open(file_name, "w") as outfile: + json.dump(result_json, outfile) if __name__ == "__main__": parser = argparse.ArgumentParser( description="Benchmark the online serving throughput.") - parser.add_argument("--backend", - type=str, - default="vllm", - choices=["vllm", "tgi"]) - parser.add_argument("--protocol", - type=str, - default="http", - choices=["http", "https"]) + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=list(ASYNC_REQUEST_FUNCS.keys()), + ) + parser.add_argument( + "--version", + type=str, + default="N/A", + help="Version of the serving 
backend/engine.", + ) + parser.add_argument( + "--base-url", + type=str, + default=None, + help="Server or API base url if not using http host and port.", + ) parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--endpoint", type=str, default="/generate") - parser.add_argument("--model", type=str, default=None) + parser.add_argument( + "--endpoint", + type=str, + default="/generate", + help="API endpoint.", + ) parser.add_argument("--dataset", type=str, required=True, help="Path to the dataset.") - parser.add_argument("--tokenizer", - type=str, - required=True, - help="Name or path of the tokenizer.") - parser.add_argument("--best-of", - type=int, - default=1, - help="Generates `best_of` sequences per prompt and " - "returns the best one.") + parser.add_argument( + "--model", + type=str, + required=True, + help="Name of the model.", + ) + parser.add_argument( + "--tokenizer", + type=str, + help= + "Name or path of the tokenizer, if not using the default model tokenizer.", + ) + parser.add_argument( + "--best-of", + type=int, + default=1, + help="Generates `best_of` sequences per prompt and " + "returns the best one.", + ) parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument("--num-prompts", - type=int, - default=1000, - help="Number of prompts to process.") - parser.add_argument("--request-rate", - type=float, - default=float("inf"), - help="Number of requests per second. If this is inf, " - "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process to synthesize " - "the request arrival times.") + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. 
" + "Otherwise, we use Poisson process to synthesize " + "the request arrival times.", + ) parser.add_argument("--seed", type=int, default=0) - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disbale tqdm progress bar.", + ) + parser.add_argument( + "--save-result", + action="store_true", + help="Specify to save benchmark results to a json file", + ) + args = parser.parse_args() main(args) diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh index bdb25b78d85b4..64d3c4f4b3889 100755 --- a/benchmarks/launch_tgi_server.sh +++ b/benchmarks/launch_tgi_server.sh @@ -6,7 +6,7 @@ TOKENS=$2 docker run --gpus all --shm-size 1g -p $PORT:80 \ -v $PWD/data:/data \ - ghcr.io/huggingface/text-generation-inference:0.8 \ + ghcr.io/huggingface/text-generation-inference:1.4.0 \ --model-id $MODEL \ --sharded false \ --max-input-length 1024 \ From f964493274c3c839b2e27453cb70f179090cd027 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Mon, 12 Feb 2024 22:53:07 -0800 Subject: [PATCH 055/112] [CI] Ensure documentation build is checked in CI (#2842) --- .buildkite/test-pipeline.yaml | 7 +++++++ .buildkite/test-template.j2 | 4 +++- docs/source/conf.py | 2 ++ docs/source/index.rst | 1 + docs/source/quantization/fp8_e5m2_kv_cache.rst | 1 + 5 files changed, 14 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 65ac2f74fb8dc..2e417ef940322 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -49,3 +49,10 @@ steps: commands: - pip install aiohttp - bash run-benchmarks.sh + +- label: Documentation Build + working_dir: "/vllm-workspace/docs" + no_gpu: True + commands: + - pip install -r requirements-docs.txt + - SPHINXOPTS=\"-W\" make html diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 7c709b6097fd4..7c1cf2b5a9b39 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -35,13 +35,15 @@ steps: - image: "{{ docker_image }}" command: ["bash"] args: - - "-c" + - '-c' - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" + {% if not step.no_gpu %} resources: requests: nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" limits: nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" + {% endif %} env: - name: HF_TOKEN valueFrom: diff --git a/docs/source/conf.py b/docs/source/conf.py index 44c976468ab06..adbe67b21a0c8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -94,3 +94,5 @@ def add_line(self, line: str, source: str, *lineno: int) -> None: autodoc.ClassDocumenter = MockedClassDocumenter + +navigation_with_keys = False diff --git a/docs/source/index.rst b/docs/source/index.rst index 9b53a643b8d46..32929257661ad 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -89,6 +89,7 @@ Documentation :caption: Quantization quantization/auto_awq + quantization/fp8_e5m2_kv_cache .. 
toctree:: :maxdepth: 2 diff --git a/docs/source/quantization/fp8_e5m2_kv_cache.rst b/docs/source/quantization/fp8_e5m2_kv_cache.rst index 10437260ad964..f1eeb59550952 100644 --- a/docs/source/quantization/fp8_e5m2_kv_cache.rst +++ b/docs/source/quantization/fp8_e5m2_kv_cache.rst @@ -9,6 +9,7 @@ The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bflaot1 Here is an example of how to enable this feature: .. code-block:: python + from vllm import LLM, SamplingParams # Sample prompts. prompts = [ From 5c976a7e1a1bec875bf6474824b7dff39e38de18 Mon Sep 17 00:00:00 2001 From: Roy Date: Tue, 13 Feb 2024 16:09:23 +0800 Subject: [PATCH 056/112] Refactor llama family models (#2637) --- vllm/model_executor/layers/layernorm.py | 25 ++ vllm/model_executor/models/__init__.py | 9 +- vllm/model_executor/models/aquila.py | 342 ------------------- vllm/model_executor/models/baichuan.py | 335 ++----------------- vllm/model_executor/models/internlm.py | 299 ----------------- vllm/model_executor/models/internlm2.py | 285 ++-------------- vllm/model_executor/models/llama.py | 162 +++++---- vllm/model_executor/models/mistral.py | 352 -------------------- vllm/model_executor/models/qwen.py | 267 ++------------- vllm/model_executor/models/stablelm.py | 283 +--------------- vllm/model_executor/models/yi.py | 330 ------------------ vllm/transformers_utils/config.py | 4 - vllm/transformers_utils/configs/__init__.py | 8 - vllm/transformers_utils/configs/aquila.py | 69 ---- vllm/transformers_utils/configs/baichuan.py | 62 ---- vllm/transformers_utils/configs/qwen.py | 60 ---- vllm/transformers_utils/configs/yi.py | 64 ---- 17 files changed, 236 insertions(+), 2720 deletions(-) delete mode 100644 vllm/model_executor/models/aquila.py delete mode 100644 vllm/model_executor/models/internlm.py delete mode 100644 vllm/model_executor/models/mistral.py delete mode 100644 vllm/model_executor/models/yi.py delete mode 100644 vllm/transformers_utils/configs/aquila.py delete mode 100644 vllm/transformers_utils/configs/baichuan.py delete mode 100644 vllm/transformers_utils/configs/qwen.py delete mode 100644 vllm/transformers_utils/configs/yi.py diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index cb3cee2bad5ad..83bc189ee4d90 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -7,6 +7,31 @@ from vllm._C import ops +class LayerNorm(nn.LayerNorm): + + def __init__( + self, + hidden_size: int, + eps: float = 1e-6, + ) -> None: + super().__init__(hidden_size, eps=eps) + + def forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """normalization.""" + if residual is not None: + x = x + residual + residual = x + x = super().forward(x) + if residual is None: + return x + else: + return x, residual + + class RMSNorm(nn.Module): """Root mean square normalization. diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index fb519b3c0cf92..7c1ce1bf1fc60 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -10,8 +10,8 @@ # Architecture -> (module, class). 
_MODELS = { - "AquilaModel": ("aquila", "AquilaForCausalLM"), - "AquilaForCausalLM": ("aquila", "AquilaForCausalLM"), # AquilaChat2 + "AquilaModel": ("llama", "LlamaForCausalLM"), + "AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2 "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), @@ -24,12 +24,12 @@ "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"), - "InternLMForCausalLM": ("internlm", "InternLMForCausalLM"), + "InternLMForCausalLM": ("llama", "LlamaForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), # For decapoda-research/llama-* "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), - "MistralForCausalLM": ("mistral", "MistralForCausalLM"), + "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), # transformers's mpt class has lower case @@ -41,7 +41,6 @@ "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "RWForCausalLM": ("falcon", "FalconForCausalLM"), "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), - "YiForCausalLM": ("yi", "YiForCausalLM") } # Models not supported by ROCm. diff --git a/vllm/model_executor/models/aquila.py b/vllm/model_executor/models/aquila.py deleted file mode 100644 index 2f2bd5ffb4a63..0000000000000 --- a/vllm/model_executor/models/aquila.py +++ /dev/null @@ -1,342 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, List, Optional, Tuple - -import torch -from torch import nn - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs.aquila import AquilaConfig - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class AquilaMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class AquilaRMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - """ - AquilaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - variance = hidden_states.to(torch.float32).pow(2).mean(-1, - keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - - return (self.weight * hidden_states).to(input_dtype) - - -class AquilaAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - max_position_embeddings: int = 8192, - rope_scaling: Optional[Dict[str, Any]] = None, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - assert self.total_num_kv_heads % tp_size == 0 - self.num_kv_heads = self.total_num_kv_heads // tp_size - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - linear_method=linear_method, - ) - 
self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class AquilaDecoderLayer(nn.Module): - - def __init__( - self, - config: AquilaConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = AquilaAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - max_position_embeddings=max_position_embeddings, - rope_scaling=rope_scaling, - linear_method=linear_method, - ) - self.mlp = AquilaMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = AquilaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - # Self Attention - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - return hidden_states - - -class AquilaModel(nn.Module): - - def __init__( - self, - config: AquilaConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - AquilaDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - ) - hidden_states = 
self.norm(hidden_states) - - return hidden_states - - -class AquilaForCausalLM(nn.Module): - - def __init__( - self, - config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = AquilaModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index f08c3c8d257ff..e0f826bf7e29a 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -18,305 +18,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only BaiChuan model compatible with HuggingFace weights.""" -import math -from typing import List, Optional, Tuple +from typing import Optional import torch -from torch import nn +from transformers import PretrainedConfig +from vllm.config import LoRAConfig -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.layers.linear import LinearMethodBase +from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs.baichuan import BaiChuanConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] - -def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: - closest_power_of_2 = 2**math.floor(math.log2(total_num_heads)) - base = torch.tensor( - 2**(-(2**-(math.log2(closest_power_of_2) - 3))), - dtype=torch.float32, - ) - powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32) - slopes = torch.pow(base, powers) - - if closest_power_of_2 != total_num_heads: - extra_base = torch.tensor( - 2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))), - dtype=torch.float32, - ) - num_remaining_heads = min(closest_power_of_2, - total_num_heads - closest_power_of_2) - extra_powers = torch.arange(start=1, - end=1 + 2 * num_remaining_heads, - step=2, - dtype=torch.int32) - slopes = torch.cat( - [slopes, torch.pow(extra_base, extra_powers)], dim=0) - return slopes - - -class BaiChuanMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class BaiChuanAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__( - self, - hidden_size: int, - num_heads: int, - position_embedding: str, - rope_theta: float = 10000, - max_position_embeddings: int = 8192, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = hidden_size - tensor_model_parallel_world_size = get_tensor_model_parallel_world_size( - ) - self.total_num_heads = num_heads - assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = (self.total_num_heads // - tensor_model_parallel_world_size) - self.head_dim = hidden_size // self.total_num_heads - self.postion_embedding = position_embedding - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - # pylint: disable=invalid-name - self.W_pack = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_heads, - bias=False, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - # Create the alibi slopes and slice them. - if self.postion_embedding == "ALIBI": - tp_rank = get_tensor_model_parallel_rank() - head_start = tp_rank * self.num_heads - head_end = (tp_rank + 1) * self.num_heads - alibi_slopes = _get_alibi_slopes(self.total_num_heads) - alibi_slopes = alibi_slopes[head_start:head_end].tolist() - - scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes) - else: - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, - base=self.rope_theta, - ) - self.scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, self.head_dim, - self.scaling) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.W_pack(hidden_states) - q, k, v = qkv.chunk(chunks=3, dim=-1) - if self.postion_embedding != "ALIBI": - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class BaiChuanDecoderLayer(nn.Module): - - def __init__(self, - config: BaiChuanConfig, - position_embedding: str, - linear_method: Optional[LinearMethodBase] = None): - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = BaiChuanAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - position_embedding=position_embedding, - rope_theta=rope_theta, - max_position_embeddings=max_position_embeddings, - linear_method=linear_method, - ) - self.mlp = BaiChuanMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def 
forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class BaiChuanModel(nn.Module): - - def __init__(self, - config: BaiChuanConfig, - position_embedding: str, - linear_method: Optional[LinearMethodBase] = None): - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - BaiChuanDecoderLayer(config, position_embedding, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class BaiChuanBaseForCausalLM(nn.Module): - - def __init__(self, - config, - position_embedding: str, - linear_method: Optional[LinearMethodBase] = None): - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = BaiChuanModel(config, position_embedding, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens +class BaiChuanBaseForCausalLM(LlamaForCausalLM): def load_weights(self, model_name_or_path: str, @@ -328,9 +42,15 @@ def load_weights(self, ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] + param_weight_map = [ + ("qkv_proj", "W_pack"), + ] params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): + for (param_name, weight_name) in param_weight_map: + name = name.replace(weight_name, param_name) + if "rotary_emb.inv_freq" in name: continue if name == "lm_head.weight": @@ -368,19 +88,28 @@ def load_weights(self, class BaichuanForCausalLM(BaiChuanBaseForCausalLM): """Baichuan 13B and Baichuan2 7B/13B.""" - def __init__(self, - config, - linear_method: Optional[LinearMethodBase] = 
None): - if config.hidden_size == 4096: # baichuan2 7b - super().__init__(config, "ROPE", linear_method) - else: # baichuan 13b, baichuan2 13b - super().__init__(config, "ALIBI", linear_method) + def __init__( + self, + config: Optional[PretrainedConfig] = None, + linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + if config.hidden_size != 4096: # baichuan 13b, baichuan2 13b + config.postion_embedding = "ALIBI" + super().__init__(config=config, + linear_method=linear_method, + lora_config=lora_config) class BaiChuanForCausalLM(BaiChuanBaseForCausalLM): """Baichuan 7B.""" - def __init__(self, - config, - linear_method: Optional[LinearMethodBase] = None): - super().__init__(config, "ROPE", linear_method) + def __init__( + self, + config: Optional[PretrainedConfig] = None, + linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__(config=config, + linear_method=linear_method, + lora_config=lora_config) diff --git a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py deleted file mode 100644 index 5d0b93793c89d..0000000000000 --- a/vllm/model_executor/models/internlm.py +++ /dev/null @@ -1,299 +0,0 @@ -# -*- coding: utf-8 -*- -from typing import Any, Dict, List, Optional, Tuple - -import torch -from torch import nn -from transformers import LlamaConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class InternLMMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class InternLMAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - bias: bool, - rope_theta: float = 10000, - max_position_embeddings: int = 8192, - linear_method: Optional[LinearMethodBase] = None, - rope_scaling: Optional[Dict[str, Any]] = None, - ): - super().__init__() - self.hidden_size = hidden_size - tensor_model_parallel_world_size = ( - get_tensor_model_parallel_world_size()) - self.total_num_heads = num_heads - assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = (self.total_num_heads // - tensor_model_parallel_world_size) - self.head_dim = hidden_size // self.total_num_heads - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - bias=bias, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=bias, - linear_method=linear_method, - ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.chunk(chunks=3, dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class InternLMDecoderLayer(nn.Module): - - def __init__( - self, - config: LlamaConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = InternLMAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - bias=config.bias, - rope_theta=rope_theta, - max_position_embeddings=max_position_embeddings, - linear_method=linear_method, - rope_scaling=getattr(config, "rope_scaling", None), - ) - self.mlp = InternLMMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = 
self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class InternLMModel(nn.Module): - - def __init__( - self, - config: LlamaConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - vocab_size = ((config.vocab_size + 63) // 64) * 64 - self.embed_tokens = VocabParallelEmbedding( - vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - InternLMDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class InternLMForCausalLM(nn.Module): - - def __init__( - self, - config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = InternLMModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. 
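The load_weights methods in this patch all drive a stacked_params_mapping, folding separate q_proj/k_proj/v_proj (and gate_proj/up_proj) checkpoint tensors into the fused qkv_proj and gate_up_proj parameters. As a rough, self-contained illustration of that fused layout (plain PyTorch, not vLLM's actual weight_loader API; every name below is hypothetical):

# Minimal sketch (not vLLM's weight_loader API): stack separate q/k/v
# checkpoint tensors into the single tensor a fused qkv_proj parameter
# expects: q rows first, then k, then v.
import torch

def fuse_qkv(checkpoint, hidden_size):
    assert checkpoint["q_proj.weight"].shape[1] == hidden_size
    return torch.cat([checkpoint["q_proj.weight"],
                      checkpoint["k_proj.weight"],
                      checkpoint["v_proj.weight"]], dim=0)

# Toy checkpoint with hidden_size = 8 and equal-sized q/k/v projections.
ckpt = {name: torch.randn(8, 8)
        for name in ("q_proj.weight", "k_proj.weight", "v_proj.weight")}
print(fuse_qkv(ckpt, hidden_size=8).shape)   # torch.Size([24, 8])

In the patch itself, weight_loader(param, loaded_weight, shard_id) writes each shard into its slice of the fused parameter rather than concatenating the tensors up front; the sketch only shows the resulting layout.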
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index ebf1d8a89a022..bffe3ae2408cc 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,276 +1,27 @@ # -*- coding: utf-8 -*- -from typing import Any, Dict, List, Optional, Tuple +from typing import Optional import torch -from torch import nn from transformers import PretrainedConfig +from vllm.config import LoRAConfig -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.layers.linear import LinearMethodBase +from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] - -class InternLM2MLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.w2 = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.w2(x) - return x - - -class InternLM2Attention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
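The KV-head bookkeeping repeated in these attention classes (partition the KV heads across tensor-parallel ranks when there are enough of them, otherwise replicate them) reduces to a small rule. A standalone sketch of that rule with made-up numbers:

# Sketch of the per-rank KV-head count used by the attention classes in
# this patch: partition KV heads across tensor-parallel ranks when
# possible, otherwise replicate so every rank still has at least one.
def kv_heads_per_rank(total_num_kv_heads: int, tp_size: int) -> int:
    if total_num_kv_heads >= tp_size:
        # Partition: KV heads are split evenly across ranks.
        assert total_num_kv_heads % tp_size == 0
    else:
        # Replicate: several ranks share a copy of the same KV head.
        assert tp_size % total_num_kv_heads == 0
    return max(1, total_num_kv_heads // tp_size)

print(kv_heads_per_rank(8, 4))   # 2 (grouped-query attention, partitioned)
print(kv_heads_per_rank(2, 8))   # 1 (replicated across ranks)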
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.wqkv = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - linear_method=linear_method, - ) - self.wo = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.wqkv(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.wo(attn_output) - return output - - -class InternLMDecoderLayer(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.attention = InternLM2Attention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - linear_method=linear_method, - ) - self.feed_forward = InternLM2MLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.attention_norm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.ffn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.attention_norm(hidden_states) - else: - hidden_states, residual = self.attention_norm( - hidden_states, residual) - hidden_states = self.attention( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.ffn_norm(hidden_states, residual) - hidden_states = self.feed_forward(hidden_states) - return hidden_states, residual - - -class InternLM2Model(nn.Module): +class InternLM2ForCausalLM(LlamaForCausalLM): def __init__( self, - config: PretrainedConfig, + config: Optional[PretrainedConfig] = None, linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, ) -> None: - super().__init__() - self.config = config - self.padding_idx = 
config.pad_token_id - self.vocab_size = config.vocab_size - self.tok_embeddings = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - InternLMDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.tok_embeddings(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class InternLM2ForCausalLM(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = InternLM2Model(config, linear_method) - self.output = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.output.weight, hidden_states, - sampling_metadata) - return next_tokens + super().__init__(config=config, + linear_method=linear_method, + lora_config=lora_config) def load_weights(self, model_name_or_path: str, @@ -282,9 +33,23 @@ def load_weights(self, ("gate_up_proj", "w1", 0), ("gate_up_proj", "w3", 1), ] + param_weight_map = [ + ("qkv_proj", "wqkv"), + ("o_proj", "wo"), + ("down_proj", "w2"), + ("input_layernorm", "attention_norm"), + ("post_attention_layernorm", "ffn_norm"), + ("embed_tokens", "tok_embeddings"), + (".self_attn.", ".attention."), + ("mlp", "feed_forward"), + ("lm_head", "output"), + ] params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): + for (param_name, weight_name) in param_weight_map: + name = name.replace(weight_name, param_name) + if "rotary_emb.inv_freq" in name: continue for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -303,7 +68,7 @@ def load_weights(self, if name.endswith(".bias") and name not in params_dict: continue param = params_dict[name] - if "wqkv" in name: + if "qkv_proj" in name: config = self.config kv_groups = config.num_attention_heads // config.num_key_value_heads head_dim = config.hidden_size // config.num_attention_heads diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e5a1abebf1420..462469ef79d1c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -21,8 +21,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
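The param_weight_map added for InternLM2 (and, in the same spirit, for Baichuan and QWen) is an ordered list of substring substitutions applied to every checkpoint key before the usual Llama-style loading runs. A small standalone helper, using the map exactly as it appears above, shows the effect on two InternLM2-style names:

# Sketch of the param_weight_map renaming step: map model-specific
# checkpoint names onto the shared Llama-style parameter names before the
# regular stacked-parameter loading. Illustrative only.
PARAM_WEIGHT_MAP = [
    ("qkv_proj", "wqkv"),
    ("o_proj", "wo"),
    ("down_proj", "w2"),
    ("input_layernorm", "attention_norm"),
    ("post_attention_layernorm", "ffn_norm"),
    ("embed_tokens", "tok_embeddings"),
    (".self_attn.", ".attention."),
    ("mlp", "feed_forward"),
    ("lm_head", "output"),
]

def remap(name: str) -> str:
    for param_name, weight_name in PARAM_WEIGHT_MAP:
        name = name.replace(weight_name, param_name)
    return name

print(remap("model.layers.0.attention.wqkv.weight"))
# model.layers.0.self_attn.qkv_proj.weight
print(remap("model.layers.0.feed_forward.w2.weight"))
# model.layers.0.mlp.down_proj.weight

Because the substitutions are plain string replaces, their order matters; the map above is applied in the order the patch lists it.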
"""Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, List, Optional, Tuple +from typing import List, Optional, Tuple +import math import torch from torch import nn from transformers import LlamaConfig @@ -40,34 +41,60 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig +from copy import deepcopy + KVCache = Tuple[torch.Tensor, torch.Tensor] +def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: + closest_power_of_2 = 2**math.floor(math.log2(total_num_heads)) + base = torch.tensor( + 2**(-(2**-(math.log2(closest_power_of_2) - 3))), + dtype=torch.float32, + ) + powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32) + slopes = torch.pow(base, powers) + + if closest_power_of_2 != total_num_heads: + extra_base = torch.tensor( + 2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))), + dtype=torch.float32, + ) + num_remaining_heads = min(closest_power_of_2, + total_num_heads - closest_power_of_2) + extra_powers = torch.arange(start=1, + end=1 + 2 * num_remaining_heads, + step=2, + dtype=torch.int32) + slopes = torch.cat( + [slopes, torch.pow(extra_base, extra_powers)], dim=0) + return slopes + + class LlamaMLP(nn.Module): def __init__( self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, + config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, + config.hidden_size, [config.intermediate_size] * 2, bias=False, linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, + self.down_proj = RowParallelLinear(config.intermediate_size, + config.hidden_size, bias=False, linear_method=linear_method) + hidden_act = getattr(config, "hidden_act", "silu") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. " "Only silu is supported for now.") @@ -84,21 +111,19 @@ class LlamaAttention(nn.Module): def __init__( self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, + config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, ) -> None: super().__init__() - self.hidden_size = hidden_size + self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads + self.total_num_heads = getattr(config, "num_attention_heads", None) assert self.total_num_heads % tp_size == 0 self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads + + # defaut to mha + self.total_num_kv_heads = getattr(config, "num_key_value_heads", + self.total_num_heads) if self.total_num_kv_heads >= tp_size: # Number of KV heads is greater than TP size, so we partition # the KV heads across multiple tensor parallel GPUs. @@ -108,39 +133,68 @@ def __init__( # the KV heads across multiple tensor parallel GPUs. 
assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads + self.head_dim = self.hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.max_position_embeddings = config.max_position_embeddings + # internlm + bias = getattr(config, "bias", False) + + # stablelm + qkv_bias = getattr(config, "use_qkv_bias", False) self.qkv_proj = QKVParallelLinear( - hidden_size, + self.hidden_size, self.head_dim, self.total_num_heads, self.total_num_kv_heads, - bias=False, + bias=bias or qkv_bias, linear_method=linear_method, ) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, - hidden_size, - bias=False, + self.hidden_size, + bias=bias, linear_method=linear_method, ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + # mistral + sliding_window = getattr(config, "sliding_window", None) + + self.postion_embedding = getattr(config, "postion_embedding", "ROPE") + # Create the alibi slopes and slice them. + if self.postion_embedding == "ALIBI": + tp_rank = get_tensor_model_parallel_rank() + head_start = tp_rank * self.num_heads + head_end = (tp_rank + 1) * self.num_heads + alibi_slopes = _get_alibi_slopes(self.total_num_heads) + alibi_slopes = alibi_slopes[head_start:head_end].tolist() + + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + alibi_slopes=alibi_slopes, + sliding_window=sliding_window) + else: + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + # stablelm + rope_pct = getattr(config, "rope_pct", 1) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=int(self.head_dim * rope_pct), + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window) def forward( self, @@ -151,7 +205,8 @@ def forward( ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) + if self.postion_embedding != "ALIBI": + q, k = self.rotary_emb(positions, q, k) k_cache, v_cache = kv_cache attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) @@ -164,32 +219,20 @@ def __init__( self, config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, + norm: Optional[torch.Tensor] = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) self.self_attn = LlamaAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - 
max_position_embeddings=max_position_embeddings, + config, linear_method=linear_method, ) self.mlp = LlamaMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, + config, linear_method=linear_method, ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) + self.input_layernorm = deepcopy(norm) + self.post_attention_layernorm = deepcopy(norm) def forward( self, @@ -226,6 +269,7 @@ def __init__( self, config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, + norm: Optional[torch.Tensor] = None, lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() @@ -241,10 +285,10 @@ def __init__( org_num_embeddings=config.vocab_size, ) self.layers = nn.ModuleList([ - LlamaDecoderLayer(config, linear_method) + LlamaDecoderLayer(config, linear_method, norm) for _ in range(config.num_hidden_layers) ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.norm = norm def forward( self, @@ -275,12 +319,18 @@ def __init__( self, config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, + norm: Optional[torch.Tensor] = None, lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() self.config = config self.linear_method = linear_method - self.model = LlamaModel(config, linear_method, lora_config=lora_config) + if norm is None: + norm = RMSNorm(config.hidden_size, config.rms_norm_eps) + self.model = LlamaModel(config, + linear_method, + norm=norm, + lora_config=lora_config) unpadded_vocab_size = config.vocab_size if lora_config: unpadded_vocab_size += lora_config.lora_extra_vocab_size diff --git a/vllm/model_executor/models/mistral.py b/vllm/model_executor/models/mistral.py deleted file mode 100644 index 01cde67844122..0000000000000 --- a/vllm/model_executor/models/mistral.py +++ /dev/null @@ -1,352 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
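With this refactor, LlamaForCausalLM builds a single norm module (RMSNorm by default) and LlamaDecoderLayer deepcopies it for the input and post-attention norms; that is what lets StableLM swap in LayerNorm and QWen pass its own epsilon later in the patch. A tiny sketch of the prototype-plus-deepcopy idea using plain torch.nn modules (hypothetical classes, not the vLLM ones):

from copy import deepcopy
import torch
from torch import nn

# Sketch of the "norm prototype" pattern: the caller constructs one norm
# module and each layer deepcopies it, so a subclass can switch the norm
# class or its epsilon without touching the layer code.
class ToyDecoderLayer(nn.Module):
    def __init__(self, hidden_size: int, norm: nn.Module):
        super().__init__()
        self.input_layernorm = deepcopy(norm)
        self.post_attention_layernorm = deepcopy(norm)
        self.linear = nn.Linear(hidden_size, hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.linear(self.input_layernorm(x))
        return self.post_attention_layernorm(x)

norm_proto = nn.LayerNorm(16, eps=1e-5)   # a subclass could pass RMSNorm here
layer = ToyDecoderLayer(16, norm_proto)
print(layer(torch.randn(2, 16)).shape)    # torch.Size([2, 16])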
-"""Inference-only Mistral model compatible with HuggingFace weights.""" -from typing import List, Optional, Tuple - -import torch -from torch import nn -from transformers import MistralConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.config import LoRAConfig - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class MistralMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class MistralAttention(nn.Module): - - def __init__(self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - max_position: int = 4096 * 32, - rope_theta: float = 10000, - linear_method: Optional[LinearMethodBase] = None, - sliding_window: Optional[int] = None) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.sliding_window = sliding_window - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position, - base=self.rope_theta, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class MistralDecoderLayer(nn.Module): - - def __init__( - self, - config: MistralConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - self.self_attn = MistralAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - max_position=config.max_position_embeddings, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - linear_method=linear_method, - sliding_window=config.sliding_window) - self.mlp = MistralMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class MistralModel(nn.Module): - - def __init__( - self, - config: MistralConfig, - linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 - self.vocab_size = config.vocab_size + 
lora_vocab - self.org_vocab_size = config.vocab_size - - self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - ) - self.layers = nn.ModuleList([ - MistralDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class MistralForCausalLM(nn.Module): - supports_lora = True - - def __init__( - self, - config: MistralConfig, - linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = MistralModel(config, - linear_method, - lora_config=lora_config) - unpadded_vocab_size = config.vocab_size - if lora_config: - unpadded_vocab_size += lora_config.lora_extra_vocab_size - self.lm_head = ParallelLMHead( - unpadded_vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config else lora_config.lora_vocab_padding_size, - ) - self.sampler = Sampler(unpadded_vocab_size, config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. 
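The LoRA plumbing that the deleted Mistral class shared with llama.py sizes the embedding for the extra LoRA vocabulary and pads the LM head up to a kernel-friendly multiple. A small arithmetic sketch of that sizing; the concrete numbers below are placeholders, not vLLM's real constants:

import math

# Sketch of the vocab sizing used when LoRA is enabled: the embedding
# gains rows for the extra LoRA vocabulary (times the number of LoRA
# slots), and the LM head is padded to a multiple chosen for kernel
# compatibility. Numbers here are illustrative placeholders.
def lora_vocab_sizes(base_vocab, lora_extra_vocab, max_loras, padding_multiple):
    embedding_rows = base_vocab + lora_extra_vocab * max_loras
    unpadded_lm_head = base_vocab + lora_extra_vocab
    padded_lm_head = math.ceil(unpadded_lm_head / padding_multiple) * padding_multiple
    return embedding_rows, padded_lm_head

print(lora_vocab_sizes(32000, 256, 4, 256))   # (33024, 32256)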
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index fbc7320fb45a4..79ad182b1c06b 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -4,253 +4,33 @@ # Copyright (c) Alibaba Cloud. # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE """Inference-only QWen model compatible with HuggingFace weights.""" -from typing import Any, Dict, List, Optional, Tuple +from typing import Optional -import torch -from torch import nn +from transformers import PretrainedConfig +from vllm.config import LoRAConfig -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs.qwen import QWenConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] - -class QWenMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str = "silu", - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.c_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.c_proj(x) - return x - - -class QWenAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - max_position_embeddings: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = hidden_size - tensor_model_parallel_world_size = get_tensor_model_parallel_world_size( - ) - self.total_num_heads = num_heads - assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = (self.total_num_heads // - tensor_model_parallel_world_size) - self.head_dim = hidden_size // self.total_num_heads - self.c_attn = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - bias=True, - linear_method=linear_method, - ) - self.c_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - self.scaling = self.head_dim**-0.5 - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.c_attn(hidden_states) - q, k, v = qkv.chunk(chunks=3, dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - - output, _ = self.c_proj(attn_output) - return output - - -class QWenBlock(nn.Module): - - def __init__( - self, - config: QWenConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - self.attn = QWenAttention(config.hidden_size, - config.num_attention_heads, - config.max_position_embeddings, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - linear_method=linear_method) - - self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - - self.mlp = QWenMLP(config.hidden_size, - config.intermediate_size // 2, - linear_method=linear_method) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.ln_1(hidden_states) - else: - hidden_states, residual = self.ln_1(hidden_states, residual) - hidden_states = self.attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.ln_2(hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class QWenModel(nn.Module): +class QWenLMHeadModel(LlamaForCausalLM): def __init__( self, - config: QWenConfig, + config: Optional[PretrainedConfig] = None, linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.vocab_size = config.vocab_size - - self.wte 
= VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.h = nn.ModuleList([ - QWenBlock(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.wte(input_ids) - residual = None - for i in range(len(self.h)): - layer = self.h[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.ln_f(hidden_states, residual) - return hidden_states - - -class QWenLMHeadModel(nn.Module): - - def __init__( - self, - config: QWenConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.transformer = QWenModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.transformer(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens + lora_config: Optional[LoRAConfig] = None, + ) -> None: + norm = RMSNorm(config.hidden_size, config.layer_norm_epsilon) + config.use_qkv_bias = True + config.intermediate_size = config.intermediate_size // 2 + super().__init__(config=config, + linear_method=linear_method, + norm=norm, + lora_config=lora_config) def load_weights(self, model_name_or_path: str, @@ -262,9 +42,24 @@ def load_weights(self, ("gate_up_proj", "w2", 0), ("gate_up_proj", "w1", 1), ] + param_weight_map = [ + ("model", "transformer"), + (".self_attn.", ".attn."), + (".layers.", ".h."), + ("qkv_proj", "c_attn"), + (".self_attn.o_proj", ".self_attn.c_proj"), + ("norm", "ln_f"), + ("mlp.down_proj", "mlp.c_proj"), + ("input_layernorm", "ln_1"), + ("post_attention_layernorm", "ln_2"), + ("embed_tokens", "wte"), + ] params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): + for (param_name, weight_name) in param_weight_map: + name = name.replace(weight_name, param_name) + if "rotary_emb.inv_freq" in name: continue for (param_name, weight_name, shard_id) in stacked_params_mapping: diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 95e5ad8ede63e..6845384768129 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -17,283 +17,26 @@ # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/modeling_stablelm_epoch.py # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json """Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" -from typing import List, Optional, Tuple +from typing import Optional -import torch -from torch import nn from transformers import PretrainedConfig -from vllm.model_executor.input_metadata import InputMetadata -from 
vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput +from vllm.model_executor.layers.linear import LinearMethodBase +from vllm.model_executor.layers.layernorm import LayerNorm +from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.config import LoRAConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] - -class StablelmMLP(nn.Module): - - def __init__(self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None) -> None: - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_up_proj = MergedColumnParallelLinear( - config.hidden_size, [config.intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(config.intermediate_size, - config.hidden_size, - bias=False) - self.act_fn = SiluAndMul() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class StablelmAttention(nn.Module): - - def __init__(self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None) -> None: - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = config.num_attention_heads - self.num_heads = self.total_num_heads // tp_size - - self.total_num_key_value_heads = config.num_key_value_heads - if self.total_num_key_value_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_key_value_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
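StableLM's partial-rotary behaviour (visible in the deleted attention code below and picked up in llama.py above via getattr(config, "rope_pct", 1)) means only the first int(head_dim * rope_pct) dimensions of each head receive rotary embeddings. A small sketch of that split, with purely illustrative tensors:

import torch

# Sketch of partial rotary embeddings: only the first
# int(head_dim * rope_pct) dimensions of q/k are rotated; the rest pass
# through unchanged. The rotation itself is stubbed out here.
head_dim, rope_pct = 64, 0.25
rotary_ndims = int(head_dim * rope_pct)      # 16 dims get RoPE

q = torch.randn(1, 8, head_dim)              # (batch, heads, head_dim)
q_rot, q_pass = q[..., :rotary_ndims], q[..., rotary_ndims:]
# ... apply the rotary embedding to q_rot here ...
q = torch.cat([q_rot, q_pass], dim=-1)
print(rotary_ndims, q.shape)                 # 16 torch.Size([1, 8, 64])

In the patch the split is delegated to get_rope through rotary_dim=int(self.head_dim * rope_pct) rather than done by hand as above.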
- assert tp_size % self.total_num_key_value_heads == 0 - self.num_key_value_heads = max( - 1, self.total_num_key_value_heads // tp_size) - self.head_dim = self.hidden_size // self.total_num_heads - self.max_position_embeddings = config.max_position_embeddings - self.rotary_ndims = int(self.head_dim * self.config.rope_pct) - self.scaling = self.head_dim**-0.5 - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_key_value_heads * self.head_dim - self.qkv_bias = getattr(config, "use_qkv_bias", False) - if (self.head_dim * self.num_heads * tp_size) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads}).") - - self.qkv_proj = QKVParallelLinear(self.hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_key_value_heads, - self.qkv_bias, - linear_method=linear_method) - self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, - self.hidden_size, - bias=False, - linear_method=linear_method) - self.rotary_ndims = int(self.head_dim * self.config.rope_pct) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.rotary_ndims, - max_position=self.config.max_position_embeddings, - base=self.config.rope_theta, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_key_value_heads) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class StablelmDecoderLayer(nn.Module): +class StablelmForCausalLM(LlamaForCausalLM): def __init__( self, - config: PretrainedConfig, + config: Optional[PretrainedConfig] = None, linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, ) -> None: - super().__init__() - self.self_attn = StablelmAttention(config) - self.mlp = StablelmMLP(config, linear_method) - self.input_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.norm_eps) - self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - return hidden_states, residual - - -class StableLMEpochModel(nn.Module): - - def __init__(self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None) -> None: - super().__init__() - # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - 
StablelmDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - ) - hidden_states = self.norm(hidden_states) - return hidden_states - - -class StablelmForCausalLM(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = StableLMEpochModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - if ("rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) + norm = LayerNorm(config.hidden_size, config.norm_eps) + super().__init__(config=config, + linear_method=linear_method, + norm=norm, + lora_config=lora_config) diff --git a/vllm/model_executor/models/yi.py b/vllm/model_executor/models/yi.py deleted file mode 100644 index 53daa6c4cd939..0000000000000 --- a/vllm/model_executor/models/yi.py +++ /dev/null @@ -1,330 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. 
All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only Yi model (https://01.ai) compatible with HuggingFace weights.""" -from typing import Any, Dict, List, Optional, Tuple - -import torch -from torch import nn -from vllm.transformers_utils.configs.yi import YiConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class YiMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class YiAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. - assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class YiDecoderLayer(nn.Module): - - def __init__( - self, - config: YiConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = YiAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - linear_method=linear_method, - ) - self.mlp = YiMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.ln1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.ln2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( 
- self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.ln1(hidden_states) - else: - hidden_states, residual = self.ln1(hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.ln2(hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class YiModel(nn.Module): - - def __init__( - self, - config: YiConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - YiDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class YiForCausalLM(nn.Module): - - def __init__( - self, - config: YiConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = YiModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. 
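# (In other words: a quantized checkpoint may carry ".bias" tensors that have
# no counterpart in this model's params_dict, so unmatched bias entries are
# ignored below rather than raising a KeyError.)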
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 8b16e559b24f2..bc27784087aa7 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -5,14 +5,10 @@ from vllm.transformers_utils.configs import * _CONFIG_REGISTRY = { - "aquila": AquilaConfig, - "baichuan": BaiChuanConfig, "chatglm": ChatGLMConfig, "mpt": MPTConfig, - "qwen": QWenConfig, "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) - "yi": YiConfig, } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 284867414e0ed..ef955f75cedaa 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,20 +1,12 @@ -from vllm.transformers_utils.configs.aquila import AquilaConfig -from vllm.transformers_utils.configs.baichuan import BaiChuanConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.mpt import MPTConfig -from vllm.transformers_utils.configs.qwen import QWenConfig # RWConfig is for the original tiiuae/falcon-40b(-instruct) and # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. from vllm.transformers_utils.configs.falcon import RWConfig -from vllm.transformers_utils.configs.yi import YiConfig __all__ = [ - "AquilaConfig", - "BaiChuanConfig", "ChatGLMConfig", "MPTConfig", - "QWenConfig", "RWConfig", - "YiConfig", ] diff --git a/vllm/transformers_utils/configs/aquila.py b/vllm/transformers_utils/configs/aquila.py deleted file mode 100644 index 86a6f2ba304af..0000000000000 --- a/vllm/transformers_utils/configs/aquila.py +++ /dev/null @@ -1,69 +0,0 @@ -# coding=utf-8 -# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Aquila model configuration""" - -from transformers import PretrainedConfig - - -class AquilaConfig(PretrainedConfig): - model_type = "aquila" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=100008, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.006, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/vllm/transformers_utils/configs/baichuan.py b/vllm/transformers_utils/configs/baichuan.py deleted file mode 100644 index 869817525c11a..0000000000000 --- a/vllm/transformers_utils/configs/baichuan.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from transformers.configuration_utils import PretrainedConfig - - -class BaiChuanConfig(PretrainedConfig): - model_type = "baichuan" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=64000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/vllm/transformers_utils/configs/qwen.py b/vllm/transformers_utils/configs/qwen.py deleted file mode 100644 index bb033a337ad04..0000000000000 --- a/vllm/transformers_utils/configs/qwen.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) Alibaba Cloud. -# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE - -from transformers import PretrainedConfig - - -class QWenConfig(PretrainedConfig): - model_type = "qwen" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=151936, - hidden_size=4096, - num_hidden_layers=32, - num_attention_heads=32, - emb_dropout_prob=0.0, - attn_dropout_prob=0.0, - layer_norm_epsilon=1e-6, - initializer_range=0.02, - max_position_embeddings=8192, - scale_attn_weights=True, - use_cache=True, - bf16=False, - fp16=False, - fp32=False, - kv_channels=128, - rotary_pct=1.0, - rotary_emb_base=10000, - use_dynamic_ntk=True, - use_logn_attn=True, - use_flash_attn="auto", - intermediate_size=22016, - no_bias=True, - tie_word_embeddings=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.emb_dropout_prob = emb_dropout_prob - self.attn_dropout_prob = attn_dropout_prob - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.scale_attn_weights = scale_attn_weights - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.bf16 = bf16 - self.fp16 = fp16 - self.fp32 = fp32 - self.kv_channels = kv_channels - self.rotary_pct = rotary_pct - self.rotary_emb_base = rotary_emb_base - self.use_dynamic_ntk = use_dynamic_ntk - self.use_logn_attn = use_logn_attn - self.use_flash_attn = use_flash_attn - self.no_bias = no_bias - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/vllm/transformers_utils/configs/yi.py b/vllm/transformers_utils/configs/yi.py deleted file mode 100644 index 359922ed26952..0000000000000 --- a/vllm/transformers_utils/configs/yi.py +++ /dev/null @@ -1,64 +0,0 @@ -""" Yi model configuration""" -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - -logger = logging.get_logger(__name__) - -Yi_PRETRAINED_CONFIG_ARCHIVE_MAP = {} - - -class YiConfig(PretrainedConfig): - r""" - 
Reference: - https://huggingface.co/01-ai/Yi-6B/blob/main/configuration_yi.py - """ - model_type = "Yi" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=64000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - output_attentions=False, - rope_theta=5000000.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.output_attentions = output_attentions - self.rope_theta = rope_theta - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) From ea356004d4749627bd1c65b7f71c76f51b5c45be Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 13 Feb 2024 09:24:59 -0800 Subject: [PATCH 057/112] Revert "Refactor llama family models (#2637)" (#2851) This reverts commit 5c976a7e1a1bec875bf6474824b7dff39e38de18. --- vllm/model_executor/layers/layernorm.py | 25 -- vllm/model_executor/models/__init__.py | 9 +- vllm/model_executor/models/aquila.py | 342 +++++++++++++++++++ vllm/model_executor/models/baichuan.py | 335 +++++++++++++++++-- vllm/model_executor/models/internlm.py | 299 +++++++++++++++++ vllm/model_executor/models/internlm2.py | 285 ++++++++++++++-- vllm/model_executor/models/llama.py | 162 ++++----- vllm/model_executor/models/mistral.py | 352 ++++++++++++++++++++ vllm/model_executor/models/qwen.py | 267 +++++++++++++-- vllm/model_executor/models/stablelm.py | 283 +++++++++++++++- vllm/model_executor/models/yi.py | 330 ++++++++++++++++++ vllm/transformers_utils/config.py | 4 + vllm/transformers_utils/configs/__init__.py | 8 + vllm/transformers_utils/configs/aquila.py | 69 ++++ vllm/transformers_utils/configs/baichuan.py | 62 ++++ vllm/transformers_utils/configs/qwen.py | 60 ++++ vllm/transformers_utils/configs/yi.py | 64 ++++ 17 files changed, 2720 insertions(+), 236 deletions(-) create mode 100644 vllm/model_executor/models/aquila.py create mode 100644 vllm/model_executor/models/internlm.py create mode 100644 vllm/model_executor/models/mistral.py create mode 100644 vllm/model_executor/models/yi.py create mode 100644 vllm/transformers_utils/configs/aquila.py create mode 100644 vllm/transformers_utils/configs/baichuan.py create mode 100644 vllm/transformers_utils/configs/qwen.py create mode 100644 vllm/transformers_utils/configs/yi.py diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 83bc189ee4d90..cb3cee2bad5ad 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -7,31 +7,6 @@ from vllm._C import ops -class LayerNorm(nn.LayerNorm): - - def __init__( - self, - hidden_size: int, - eps: float = 1e-6, - ) -> None: - super().__init__(hidden_size, eps=eps) - - 
def forward( - self, - x: torch.Tensor, - residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - """normalization.""" - if residual is not None: - x = x + residual - residual = x - x = super().forward(x) - if residual is None: - return x - else: - return x, residual - - class RMSNorm(nn.Module): """Root mean square normalization. diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 7c1ce1bf1fc60..fb519b3c0cf92 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -10,8 +10,8 @@ # Architecture -> (module, class). _MODELS = { - "AquilaModel": ("llama", "LlamaForCausalLM"), - "AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2 + "AquilaModel": ("aquila", "AquilaForCausalLM"), + "AquilaForCausalLM": ("aquila", "AquilaForCausalLM"), # AquilaChat2 "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), @@ -24,12 +24,12 @@ "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"), - "InternLMForCausalLM": ("llama", "LlamaForCausalLM"), + "InternLMForCausalLM": ("internlm", "InternLMForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), # For decapoda-research/llama-* "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), - "MistralForCausalLM": ("llama", "LlamaForCausalLM"), + "MistralForCausalLM": ("mistral", "MistralForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), # transformers's mpt class has lower case @@ -41,6 +41,7 @@ "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "RWForCausalLM": ("falcon", "FalconForCausalLM"), "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), + "YiForCausalLM": ("yi", "YiForCausalLM") } # Models not supported by ROCm. diff --git a/vllm/model_executor/models/aquila.py b/vllm/model_executor/models/aquila.py new file mode 100644 index 0000000000000..2f2bd5ffb4a63 --- /dev/null +++ b/vllm/model_executor/models/aquila.py @@ -0,0 +1,342 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
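# A minimal sketch of the "fused residual" normalization convention the
# decoder layers in this patch rely on: the norm optionally adds the incoming
# residual first, then normalizes, and returns both the normalized activations
# and the new residual.  `rms_norm` below is a plain reference implementation
# written for illustration, not the fused kernel vLLM actually dispatches to.
from typing import Optional, Tuple, Union

import torch


def rms_norm(
    x: torch.Tensor,
    weight: torch.Tensor,
    eps: float = 1e-6,
    residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    if residual is not None:
        x = x + residual      # fold the skip connection into the norm
        residual = x          # the pre-norm sum becomes the next residual
    variance = x.pow(2).mean(-1, keepdim=True)
    x = x * torch.rsqrt(variance + eps) * weight
    return x if residual is None else (x, residual)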
+"""Inference-only LLaMA model compatible with HuggingFace weights.""" +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput +from vllm.transformers_utils.configs.aquila import AquilaConfig + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class AquilaMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class AquilaRMSNorm(nn.Module): + + def __init__(self, hidden_size, eps=1e-6): + """ + AquilaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, + keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + + self.variance_epsilon) + + return (self.weight * hidden_states).to(input_dtype) + + +class AquilaAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + max_position_embeddings: int = 8192, + rope_scaling: Optional[Dict[str, Any]] = None, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + assert self.total_num_kv_heads % tp_size == 0 + self.num_kv_heads = self.total_num_kv_heads // tp_size + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + 
self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class AquilaDecoderLayer(nn.Module): + + def __init__( + self, + config: AquilaConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = AquilaAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + max_position_embeddings=max_position_embeddings, + rope_scaling=rope_scaling, + linear_method=linear_method, + ) + self.mlp = AquilaMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.input_layernorm = AquilaRMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + # Self Attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +class AquilaModel(nn.Module): + + def __init__( + self, + config: AquilaConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + AquilaDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + ) + hidden_states = 
self.norm(hidden_states) + + return hidden_states + + +class AquilaForCausalLM(nn.Module): + + def __init__( + self, + config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = AquilaModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index e0f826bf7e29a..f08c3c8d257ff 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -18,19 +18,305 @@ # See the License for the specific language governing permissions and # limitations under the License. 
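# A short sketch of the stacked-parameter remapping that the load_weights
# methods in this patch perform: checkpoints store q_proj/k_proj/v_proj (and
# gate_proj/up_proj) separately, while the model keeps them fused, so each
# checkpoint name is rewritten and routed to a shard of the fused parameter.
# The helper `remap` is a simplified stand-in for the loop above, not a
# function defined by the patch.
stacked_params_mapping = [
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
    ("gate_up_proj", "gate_proj", 0),
    ("gate_up_proj", "up_proj", 1),
]


def remap(checkpoint_name: str):
    # e.g. "model.layers.0.self_attn.k_proj.weight"
    #   -> ("model.layers.0.self_attn.qkv_proj.weight", "k")
    for fused_name, shard_name, shard_id in stacked_params_mapping:
        if shard_name in checkpoint_name:
            return checkpoint_name.replace(shard_name, fused_name), shard_id
    return checkpoint_name, None  # unfused weights load as-is


assert remap("model.layers.0.self_attn.k_proj.weight") == (
    "model.layers.0.self_attn.qkv_proj.weight", "k")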
"""Inference-only BaiChuan model compatible with HuggingFace weights.""" -from typing import Optional +import math +from typing import List, Optional, Tuple import torch -from transformers import PretrainedConfig -from vllm.config import LoRAConfig +from torch import nn -from vllm.model_executor.layers.linear import LinearMethodBase -from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) +from vllm.sequence import SamplerOutput +from vllm.transformers_utils.configs.baichuan import BaiChuanConfig +KVCache = Tuple[torch.Tensor, torch.Tensor] -class BaiChuanBaseForCausalLM(LlamaForCausalLM): + +def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: + closest_power_of_2 = 2**math.floor(math.log2(total_num_heads)) + base = torch.tensor( + 2**(-(2**-(math.log2(closest_power_of_2) - 3))), + dtype=torch.float32, + ) + powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32) + slopes = torch.pow(base, powers) + + if closest_power_of_2 != total_num_heads: + extra_base = torch.tensor( + 2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))), + dtype=torch.float32, + ) + num_remaining_heads = min(closest_power_of_2, + total_num_heads - closest_power_of_2) + extra_powers = torch.arange(start=1, + end=1 + 2 * num_remaining_heads, + step=2, + dtype=torch.int32) + slopes = torch.cat( + [slopes, torch.pow(extra_base, extra_powers)], dim=0) + return slopes + + +class BaiChuanMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class BaiChuanAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + hidden_size: int, + num_heads: int, + position_embedding: str, + rope_theta: float = 10000, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.hidden_size = hidden_size + tensor_model_parallel_world_size = get_tensor_model_parallel_world_size( + ) + self.total_num_heads = num_heads + assert self.total_num_heads % tensor_model_parallel_world_size == 0 + self.num_heads = (self.total_num_heads // + tensor_model_parallel_world_size) + self.head_dim = hidden_size // self.total_num_heads + self.postion_embedding = position_embedding + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + # pylint: disable=invalid-name + self.W_pack = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_heads, + bias=False, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + # Create the alibi slopes and slice them. + if self.postion_embedding == "ALIBI": + tp_rank = get_tensor_model_parallel_rank() + head_start = tp_rank * self.num_heads + head_end = (tp_rank + 1) * self.num_heads + alibi_slopes = _get_alibi_slopes(self.total_num_heads) + alibi_slopes = alibi_slopes[head_start:head_end].tolist() + + scaling = self.head_dim**-0.5 + self.attn = PagedAttention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes) + else: + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_theta, + ) + self.scaling = self.head_dim**-0.5 + self.attn = PagedAttention(self.num_heads, self.head_dim, + self.scaling) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.W_pack(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + if self.postion_embedding != "ALIBI": + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class BaiChuanDecoderLayer(nn.Module): + + def __init__(self, + config: BaiChuanConfig, + position_embedding: str, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = BaiChuanAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + position_embedding=position_embedding, + rope_theta=rope_theta, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + ) + self.mlp = BaiChuanMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def 
forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class BaiChuanModel(nn.Module): + + def __init__(self, + config: BaiChuanConfig, + position_embedding: str, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + BaiChuanDecoderLayer(config, position_embedding, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class BaiChuanBaseForCausalLM(nn.Module): + + def __init__(self, + config, + position_embedding: str, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = BaiChuanModel(config, position_embedding, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens def load_weights(self, model_name_or_path: str, @@ -42,15 +328,9 @@ def load_weights(self, ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] - param_weight_map = [ - ("qkv_proj", "W_pack"), - ] params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): - for (param_name, weight_name) in param_weight_map: - name = name.replace(weight_name, param_name) - if "rotary_emb.inv_freq" in name: continue if name == "lm_head.weight": @@ -88,28 +368,19 @@ def load_weights(self, class BaichuanForCausalLM(BaiChuanBaseForCausalLM): """Baichuan 13B and Baichuan2 7B/13B.""" - def __init__( - self, - config: Optional[PretrainedConfig] = None, - linear_method: Optional[LinearMethodBase] = None, - 
lora_config: Optional[LoRAConfig] = None, - ) -> None: - if config.hidden_size != 4096: # baichuan 13b, baichuan2 13b - config.postion_embedding = "ALIBI" - super().__init__(config=config, - linear_method=linear_method, - lora_config=lora_config) + def __init__(self, + config, + linear_method: Optional[LinearMethodBase] = None): + if config.hidden_size == 4096: # baichuan2 7b + super().__init__(config, "ROPE", linear_method) + else: # baichuan 13b, baichuan2 13b + super().__init__(config, "ALIBI", linear_method) class BaiChuanForCausalLM(BaiChuanBaseForCausalLM): """Baichuan 7B.""" - def __init__( - self, - config: Optional[PretrainedConfig] = None, - linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: - super().__init__(config=config, - linear_method=linear_method, - lora_config=lora_config) + def __init__(self, + config, + linear_method: Optional[LinearMethodBase] = None): + super().__init__(config, "ROPE", linear_method) diff --git a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py new file mode 100644 index 0000000000000..5d0b93793c89d --- /dev/null +++ b/vllm/model_executor/models/internlm.py @@ -0,0 +1,299 @@ +# -*- coding: utf-8 -*- +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn +from transformers import LlamaConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class InternLMMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class InternLMAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + bias: bool, + rope_theta: float = 10000, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + rope_scaling: Optional[Dict[str, Any]] = None, + ): + super().__init__() + self.hidden_size = hidden_size + tensor_model_parallel_world_size = ( + get_tensor_model_parallel_world_size()) + self.total_num_heads = num_heads + assert self.total_num_heads % tensor_model_parallel_world_size == 0 + self.num_heads = (self.total_num_heads // + tensor_model_parallel_world_size) + self.head_dim = hidden_size // self.total_num_heads + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + bias=bias, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=bias, + linear_method=linear_method, + ) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class InternLMDecoderLayer(nn.Module): + + def __init__( + self, + config: LlamaConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = InternLMAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + bias=config.bias, + rope_theta=rope_theta, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + rope_scaling=getattr(config, "rope_scaling", None), + ) + self.mlp = InternLMMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = 
self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class InternLMModel(nn.Module): + + def __init__( + self, + config: LlamaConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + vocab_size = ((config.vocab_size + 63) // 64) * 64 + self.embed_tokens = VocabParallelEmbedding( + vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + InternLMDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class InternLMForCausalLM(nn.Module): + + def __init__( + self, + config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = InternLMModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index bffe3ae2408cc..ebf1d8a89a022 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,27 +1,276 @@ # -*- coding: utf-8 -*- -from typing import Optional +from typing import Any, Dict, List, Optional, Tuple import torch +from torch import nn from transformers import PretrainedConfig -from vllm.config import LoRAConfig -from vllm.model_executor.layers.linear import LinearMethodBase -from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) +from vllm.sequence import SamplerOutput +KVCache = Tuple[torch.Tensor, torch.Tensor] -class InternLM2ForCausalLM(LlamaForCausalLM): + +class InternLM2MLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.w2 = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.w2(x) + return x + + +class InternLM2Attention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
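# (Illustrative numbers, not from the patch: with 8 KV heads and tp_size=4,
# each rank keeps 8 // 4 = 2 KV heads; with 2 KV heads and tp_size=8, each KV
# head is replicated across 8 // 2 = 4 ranks and every rank works with
# max(1, 2 // 8) = 1 KV head locally.)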
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.wqkv = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + self.wo = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.wqkv(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.wo(attn_output) + return output + + +class InternLMDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.attention = InternLM2Attention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + ) + self.feed_forward = InternLM2MLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.attention_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.ffn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.attention_norm(hidden_states) + else: + hidden_states, residual = self.attention_norm( + hidden_states, residual) + hidden_states = self.attention( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.ffn_norm(hidden_states, residual) + hidden_states = self.feed_forward(hidden_states) + return hidden_states, residual + + +class InternLM2Model(nn.Module): def __init__( self, - config: Optional[PretrainedConfig] = None, + config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, ) -> None: - super().__init__(config=config, - linear_method=linear_method, - lora_config=lora_config) + 
super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.tok_embeddings = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + InternLMDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.tok_embeddings(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class InternLM2ForCausalLM(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = InternLM2Model(config, linear_method) + self.output = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.output.weight, hidden_states, + sampling_metadata) + return next_tokens def load_weights(self, model_name_or_path: str, @@ -33,23 +282,9 @@ def load_weights(self, ("gate_up_proj", "w1", 0), ("gate_up_proj", "w3", 1), ] - param_weight_map = [ - ("qkv_proj", "wqkv"), - ("o_proj", "wo"), - ("down_proj", "w2"), - ("input_layernorm", "attention_norm"), - ("post_attention_layernorm", "ffn_norm"), - ("embed_tokens", "tok_embeddings"), - (".self_attn.", ".attention."), - ("mlp", "feed_forward"), - ("lm_head", "output"), - ] params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): - for (param_name, weight_name) in param_weight_map: - name = name.replace(weight_name, param_name) - if "rotary_emb.inv_freq" in name: continue for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -68,7 +303,7 @@ def load_weights(self, if name.endswith(".bias") and name not in params_dict: continue param = params_dict[name] - if "qkv_proj" in name: + if "wqkv" in name: config = self.config kv_groups = config.num_attention_heads // config.num_key_value_heads head_dim = config.hidden_size // config.num_attention_heads diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 462469ef79d1c..e5a1abebf1420 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -21,9 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple -import math import torch from torch import nn from transformers import LlamaConfig @@ -41,60 +40,34 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) + get_tensor_model_parallel_world_size) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig -from copy import deepcopy - KVCache = Tuple[torch.Tensor, torch.Tensor] -def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: - closest_power_of_2 = 2**math.floor(math.log2(total_num_heads)) - base = torch.tensor( - 2**(-(2**-(math.log2(closest_power_of_2) - 3))), - dtype=torch.float32, - ) - powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32) - slopes = torch.pow(base, powers) - - if closest_power_of_2 != total_num_heads: - extra_base = torch.tensor( - 2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))), - dtype=torch.float32, - ) - num_remaining_heads = min(closest_power_of_2, - total_num_heads - closest_power_of_2) - extra_powers = torch.arange(start=1, - end=1 + 2 * num_remaining_heads, - step=2, - dtype=torch.int32) - slopes = torch.cat( - [slopes, torch.pow(extra_base, extra_powers)], dim=0) - return slopes - - class LlamaMLP(nn.Module): def __init__( self, - config: LlamaConfig, + hidden_size: int, + intermediate_size: int, + hidden_act: str, linear_method: Optional[LinearMethodBase] = None, ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( - config.hidden_size, [config.intermediate_size] * 2, + hidden_size, [intermediate_size] * 2, bias=False, linear_method=linear_method) - self.down_proj = RowParallelLinear(config.intermediate_size, - config.hidden_size, + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, bias=False, linear_method=linear_method) - hidden_act = getattr(config, "hidden_act", "silu") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. " "Only silu is supported for now.") @@ -111,19 +84,21 @@ class LlamaAttention(nn.Module): def __init__( self, - config: LlamaConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, linear_method: Optional[LinearMethodBase] = None, ) -> None: super().__init__() - self.hidden_size = config.hidden_size + self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = getattr(config, "num_attention_heads", None) + self.total_num_heads = num_heads assert self.total_num_heads % tp_size == 0 self.num_heads = self.total_num_heads // tp_size - - # defaut to mha - self.total_num_kv_heads = getattr(config, "num_key_value_heads", - self.total_num_heads) + self.total_num_kv_heads = num_kv_heads if self.total_num_kv_heads >= tp_size: # Number of KV heads is greater than TP size, so we partition # the KV heads across multiple tensor parallel GPUs. @@ -133,68 +108,39 @@ def __init__( # the KV heads across multiple tensor parallel GPUs. 
assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = self.hidden_size // self.total_num_heads + self.head_dim = hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings - # internlm - bias = getattr(config, "bias", False) - - # stablelm - qkv_bias = getattr(config, "use_qkv_bias", False) self.qkv_proj = QKVParallelLinear( - self.hidden_size, + hidden_size, self.head_dim, self.total_num_heads, self.total_num_kv_heads, - bias=bias or qkv_bias, + bias=False, linear_method=linear_method, ) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, - self.hidden_size, - bias=bias, + hidden_size, + bias=False, linear_method=linear_method, ) - # mistral - sliding_window = getattr(config, "sliding_window", None) - - self.postion_embedding = getattr(config, "postion_embedding", "ROPE") - # Create the alibi slopes and slice them. - if self.postion_embedding == "ALIBI": - tp_rank = get_tensor_model_parallel_rank() - head_start = tp_rank * self.num_heads - head_end = (tp_rank + 1) * self.num_heads - alibi_slopes = _get_alibi_slopes(self.total_num_heads) - alibi_slopes = alibi_slopes[head_start:head_end].tolist() - - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - alibi_slopes=alibi_slopes, - sliding_window=sliding_window) - else: - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - # stablelm - rope_pct = getattr(config, "rope_pct", 1) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=int(self.head_dim * rope_pct), - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=sliding_window) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, @@ -205,8 +151,7 @@ def forward( ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - if self.postion_embedding != "ALIBI": - q, k = self.rotary_emb(positions, q, k) + q, k = self.rotary_emb(positions, q, k) k_cache, v_cache = kv_cache attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) @@ -219,20 +164,32 @@ def __init__( self, config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, - norm: Optional[torch.Tensor] = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) self.self_attn = LlamaAttention( - config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + 
max_position_embeddings=max_position_embeddings, linear_method=linear_method, ) self.mlp = LlamaMLP( - config, + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, linear_method=linear_method, ) - self.input_layernorm = deepcopy(norm) - self.post_attention_layernorm = deepcopy(norm) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) def forward( self, @@ -269,7 +226,6 @@ def __init__( self, config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, - norm: Optional[torch.Tensor] = None, lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() @@ -285,10 +241,10 @@ def __init__( org_num_embeddings=config.vocab_size, ) self.layers = nn.ModuleList([ - LlamaDecoderLayer(config, linear_method, norm) + LlamaDecoderLayer(config, linear_method) for _ in range(config.num_hidden_layers) ]) - self.norm = norm + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def forward( self, @@ -319,18 +275,12 @@ def __init__( self, config: LlamaConfig, linear_method: Optional[LinearMethodBase] = None, - norm: Optional[torch.Tensor] = None, lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() self.config = config self.linear_method = linear_method - if norm is None: - norm = RMSNorm(config.hidden_size, config.rms_norm_eps) - self.model = LlamaModel(config, - linear_method, - norm=norm, - lora_config=lora_config) + self.model = LlamaModel(config, linear_method, lora_config=lora_config) unpadded_vocab_size = config.vocab_size if lora_config: unpadded_vocab_size += lora_config.lora_extra_vocab_size diff --git a/vllm/model_executor/models/mistral.py b/vllm/model_executor/models/mistral.py new file mode 100644 index 0000000000000..01cde67844122 --- /dev/null +++ b/vllm/model_executor/models/mistral.py @@ -0,0 +1,352 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
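The RMSNorm layers constructed above are used with the residual-threading call sites seen throughout this patch (for example `hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)` in the decoder layers): when a residual is passed, the norm adds it in and returns both the normalized activations and the updated residual, avoiding a separate elementwise add. A minimal reference implementation of that contract, a plain-PyTorch sketch rather than vLLM's fused kernel:

import torch
from torch import nn

class ResidualRMSNorm(nn.Module):
    """RMSNorm with an optional residual add, mirroring the call sites above."""

    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x: torch.Tensor, residual: torch.Tensor = None):
        if residual is not None:
            x = x + residual   # fold the residual add into the norm
            residual = x       # hand the pre-norm sum back to the caller
        variance = x.pow(2).mean(-1, keepdim=True)
        out = x * torch.rsqrt(variance + self.eps) * self.weight
        return out if residual is None else (out, residual)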
+"""Inference-only Mistral model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import MistralConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput +from vllm.config import LoRAConfig + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class MistralMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class MistralAttention(nn.Module): + + def __init__(self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 4096 * 32, + rope_theta: float = 10000, + linear_method: Optional[LinearMethodBase] = None, + sliding_window: Optional[int] = None) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.sliding_window = sliding_window + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position, + base=self.rope_theta, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class MistralDecoderLayer(nn.Module): + + def __init__( + self, + config: MistralConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + # Requires transformers > 4.32.0 + rope_theta = getattr(config, "rope_theta", 10000) + self.self_attn = MistralAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + max_position=config.max_position_embeddings, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + linear_method=linear_method, + sliding_window=config.sliding_window) + self.mlp = MistralMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class MistralModel(nn.Module): + + def __init__( + self, + config: MistralConfig, + linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + 
lora_vocab + self.org_vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + self.layers = nn.ModuleList([ + MistralDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class MistralForCausalLM(nn.Module): + supports_lora = True + + def __init__( + self, + config: MistralConfig, + linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = MistralModel(config, + linear_method, + lora_config=lora_config) + unpadded_vocab_size = config.vocab_size + if lora_config: + unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + ) + self.sampler = Sampler(unpadded_vocab_size, config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 79ad182b1c06b..fbc7320fb45a4 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -4,33 +4,253 @@ # Copyright (c) Alibaba Cloud. # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE """Inference-only QWen model compatible with HuggingFace weights.""" -from typing import Optional +from typing import Any, Dict, List, Optional, Tuple -from transformers import PretrainedConfig -from vllm.config import LoRAConfig +import torch +from torch import nn -from vllm.model_executor.layers.linear import LinearMethodBase +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) +from vllm.sequence import SamplerOutput +from vllm.transformers_utils.configs.qwen import QWenConfig +KVCache = Tuple[torch.Tensor, torch.Tensor] -class QWenLMHeadModel(LlamaForCausalLM): + +class QWenMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str = "silu", + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.c_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.c_proj(x) + return x + + +class QWenAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + max_position_embeddings: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.hidden_size = hidden_size + tensor_model_parallel_world_size = get_tensor_model_parallel_world_size( + ) + self.total_num_heads = num_heads + assert self.total_num_heads % tensor_model_parallel_world_size == 0 + self.num_heads = (self.total_num_heads // + tensor_model_parallel_world_size) + self.head_dim = hidden_size // self.total_num_heads + self.c_attn = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + bias=True, + linear_method=linear_method, + ) + self.c_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + self.scaling = self.head_dim**-0.5 + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.c_attn(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + + output, _ = self.c_proj(attn_output) + return output + + +class QWenBlock(nn.Module): + + def __init__( + self, + config: QWenConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + self.attn = QWenAttention(config.hidden_size, + config.num_attention_heads, + config.max_position_embeddings, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + linear_method=linear_method) + + self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + self.mlp = QWenMLP(config.hidden_size, + config.intermediate_size // 2, + linear_method=linear_method) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + else: + hidden_states, residual = self.ln_1(hidden_states, residual) + hidden_states = self.attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.ln_2(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class QWenModel(nn.Module): def __init__( self, - config: Optional[PretrainedConfig] = None, + config: QWenConfig, linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: - norm = RMSNorm(config.hidden_size, config.layer_norm_epsilon) - 
config.use_qkv_bias = True - config.intermediate_size = config.intermediate_size // 2 - super().__init__(config=config, - linear_method=linear_method, - norm=norm, - lora_config=lora_config) + ): + super().__init__() + self.config = config + self.vocab_size = config.vocab_size + + self.wte = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.h = nn.ModuleList([ + QWenBlock(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.wte(input_ids) + residual = None + for i in range(len(self.h)): + layer = self.h[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states, _ = self.ln_f(hidden_states, residual) + return hidden_states + + +class QWenLMHeadModel(nn.Module): + + def __init__( + self, + config: QWenConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.linear_method = linear_method + self.transformer = QWenModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.transformer(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens def load_weights(self, model_name_or_path: str, @@ -42,24 +262,9 @@ def load_weights(self, ("gate_up_proj", "w2", 0), ("gate_up_proj", "w1", 1), ] - param_weight_map = [ - ("model", "transformer"), - (".self_attn.", ".attn."), - (".layers.", ".h."), - ("qkv_proj", "c_attn"), - (".self_attn.o_proj", ".self_attn.c_proj"), - ("norm", "ln_f"), - ("mlp.down_proj", "mlp.c_proj"), - ("input_layernorm", "ln_1"), - ("post_attention_layernorm", "ln_2"), - ("embed_tokens", "wte"), - ] params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): - for (param_name, weight_name) in param_weight_map: - name = name.replace(weight_name, param_name) - if "rotary_emb.inv_freq" in name: continue for (param_name, weight_name, shard_id) in stacked_params_mapping: diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 6845384768129..95e5ad8ede63e 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -17,26 +17,283 @@ # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/modeling_stablelm_epoch.py # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json """Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" -from typing import Optional +from typing import List, Optional, Tuple +import torch +from torch import nn from transformers import PretrainedConfig -from vllm.model_executor.layers.linear import LinearMethodBase -from vllm.model_executor.layers.layernorm import LayerNorm 
-from vllm.model_executor.models.llama import LlamaForCausalLM -from vllm.config import LoRAConfig +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput +KVCache = Tuple[torch.Tensor, torch.Tensor] -class StablelmForCausalLM(LlamaForCausalLM): + +class StablelmMLP(nn.Module): + + def __init__(self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None) -> None: + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_up_proj = MergedColumnParallelLinear( + config.hidden_size, [config.intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=False) + self.act_fn = SiluAndMul() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class StablelmAttention(nn.Module): + + def __init__(self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None) -> None: + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + self.num_heads = self.total_num_heads // tp_size + + self.total_num_key_value_heads = config.num_key_value_heads + if self.total_num_key_value_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_key_value_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_key_value_heads == 0 + self.num_key_value_heads = max( + 1, self.total_num_key_value_heads // tp_size) + self.head_dim = self.hidden_size // self.total_num_heads + self.max_position_embeddings = config.max_position_embeddings + self.rotary_ndims = int(self.head_dim * self.config.rope_pct) + self.scaling = self.head_dim**-0.5 + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_key_value_heads * self.head_dim + self.qkv_bias = getattr(config, "use_qkv_bias", False) + if (self.head_dim * self.num_heads * tp_size) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads}).") + + self.qkv_proj = QKVParallelLinear(self.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_key_value_heads, + self.qkv_bias, + linear_method=linear_method) + self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, + self.hidden_size, + bias=False, + linear_method=linear_method) + self.rotary_ndims = int(self.head_dim * self.config.rope_pct) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.rotary_ndims, + max_position=self.config.max_position_embeddings, + base=self.config.rope_theta, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_key_value_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class StablelmDecoderLayer(nn.Module): def __init__( self, - config: Optional[PretrainedConfig] = None, + config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, ) -> None: - norm = LayerNorm(config.hidden_size, config.norm_eps) - super().__init__(config=config, - linear_method=linear_method, - norm=norm, - lora_config=lora_config) + super().__init__() + self.self_attn = StablelmAttention(config) + self.mlp = StablelmMLP(config, linear_method) + self.input_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.norm_eps) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states, residual + + +class StableLMEpochModel(nn.Module): + + def __init__(self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None) -> None: + super().__init__() + # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + self.embed_tokens 
= VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + StablelmDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + ) + hidden_states = self.norm(hidden_states) + return hidden_states + + +class StablelmForCausalLM(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = StableLMEpochModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/yi.py b/vllm/model_executor/models/yi.py new file mode 100644 index 0000000000000..53daa6c4cd939 --- /dev/null +++ b/vllm/model_executor/models/yi.py @@ -0,0 +1,330 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
+# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Yi model (https://01.ai) compatible with HuggingFace weights.""" +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn +from vllm.transformers_utils.configs.yi import YiConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class YiMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class YiAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. 
+ assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=self.rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class YiDecoderLayer(nn.Module): + + def __init__( + self, + config: YiConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = YiAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + ) + self.mlp = YiMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.ln1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.ln2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.ln1(hidden_states) + else: + hidden_states, residual = self.ln1(hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.ln2(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class YiModel(nn.Module): + + def __init__( + self, + config: YiConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = 
config.vocab_size + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + YiDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class YiForCausalLM(nn.Module): + + def __init__( + self, + config: YiConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = YiModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index bc27784087aa7..8b16e559b24f2 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -5,10 +5,14 @@ from vllm.transformers_utils.configs import * _CONFIG_REGISTRY = { + "aquila": AquilaConfig, + "baichuan": BaiChuanConfig, "chatglm": ChatGLMConfig, "mpt": MPTConfig, + "qwen": QWenConfig, "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) + "yi": YiConfig, } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index ef955f75cedaa..284867414e0ed 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,12 +1,20 @@ +from vllm.transformers_utils.configs.aquila import AquilaConfig +from vllm.transformers_utils.configs.baichuan import BaiChuanConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.mpt import MPTConfig +from vllm.transformers_utils.configs.qwen import QWenConfig # RWConfig is for the original tiiuae/falcon-40b(-instruct) and # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. from vllm.transformers_utils.configs.falcon import RWConfig +from vllm.transformers_utils.configs.yi import YiConfig __all__ = [ + "AquilaConfig", + "BaiChuanConfig", "ChatGLMConfig", "MPTConfig", + "QWenConfig", "RWConfig", + "YiConfig", ] diff --git a/vllm/transformers_utils/configs/aquila.py b/vllm/transformers_utils/configs/aquila.py new file mode 100644 index 0000000000000..86a6f2ba304af --- /dev/null +++ b/vllm/transformers_utils/configs/aquila.py @@ -0,0 +1,69 @@ +# coding=utf-8 +# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Aquila model configuration""" + +from transformers import PretrainedConfig + + +class AquilaConfig(PretrainedConfig): + model_type = "aquila" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=100008, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.006, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/vllm/transformers_utils/configs/baichuan.py b/vllm/transformers_utils/configs/baichuan.py new file mode 100644 index 0000000000000..869817525c11a --- /dev/null +++ b/vllm/transformers_utils/configs/baichuan.py @@ -0,0 +1,62 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from transformers.configuration_utils import PretrainedConfig + + +class BaiChuanConfig(PretrainedConfig): + model_type = "baichuan" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=64000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + hidden_act="silu", + max_position_embeddings=4096, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/vllm/transformers_utils/configs/qwen.py b/vllm/transformers_utils/configs/qwen.py new file mode 100644 index 0000000000000..bb033a337ad04 --- /dev/null +++ b/vllm/transformers_utils/configs/qwen.py @@ -0,0 +1,60 @@ +# Copyright (c) Alibaba Cloud. +# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE + +from transformers import PretrainedConfig + + +class QWenConfig(PretrainedConfig): + model_type = "qwen" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=4096, + num_hidden_layers=32, + num_attention_heads=32, + emb_dropout_prob=0.0, + attn_dropout_prob=0.0, + layer_norm_epsilon=1e-6, + initializer_range=0.02, + max_position_embeddings=8192, + scale_attn_weights=True, + use_cache=True, + bf16=False, + fp16=False, + fp32=False, + kv_channels=128, + rotary_pct=1.0, + rotary_emb_base=10000, + use_dynamic_ntk=True, + use_logn_attn=True, + use_flash_attn="auto", + intermediate_size=22016, + no_bias=True, + tie_word_embeddings=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.emb_dropout_prob = emb_dropout_prob + self.attn_dropout_prob = attn_dropout_prob + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.scale_attn_weights = scale_attn_weights + self.use_cache = use_cache + self.max_position_embeddings = max_position_embeddings + self.bf16 = bf16 + self.fp16 = fp16 + self.fp32 = fp32 + self.kv_channels = kv_channels + self.rotary_pct = rotary_pct + self.rotary_emb_base = rotary_emb_base + self.use_dynamic_ntk = use_dynamic_ntk + self.use_logn_attn = use_logn_attn + self.use_flash_attn = use_flash_attn + self.no_bias = no_bias + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/vllm/transformers_utils/configs/yi.py b/vllm/transformers_utils/configs/yi.py new file mode 100644 index 0000000000000..359922ed26952 --- /dev/null +++ b/vllm/transformers_utils/configs/yi.py @@ -0,0 +1,64 @@ +""" Yi model configuration""" +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +Yi_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class YiConfig(PretrainedConfig): + r""" + Reference: + 
https://huggingface.co/01-ai/Yi-6B/blob/main/configuration_yi.py + """ + model_type = "Yi" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=64000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=4, + hidden_act="silu", + max_position_embeddings=4096, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + output_attentions=False, + rope_theta=5000000.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.output_attentions = output_attentions + self.rope_theta = rope_theta + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) From a463c333dd7905519141abe4f61b63ccc6b739a9 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 13 Feb 2024 11:32:06 -0800 Subject: [PATCH 058/112] Use CuPy for CUDA graphs (#2811) --- requirements.txt | 1 + vllm/engine/llm_engine.py | 2 +- .../parallel_utils/communication_op.py | 13 +- .../parallel_utils/cupy_utils.py | 130 ++++++++++++++++++ .../parallel_utils/parallel_state.py | 37 +++++ vllm/test_utils.py | 7 +- vllm/worker/model_runner.py | 52 +++++-- vllm/worker/worker.py | 26 +++- 8 files changed, 246 insertions(+), 22 deletions(-) create mode 100644 vllm/model_executor/parallel_utils/cupy_utils.py diff --git a/requirements.txt b/requirements.txt index 5684b2c29634d..b030ae616919b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ pydantic >= 2.0 # Required for OpenAI server. aioprometheus[starlette] pynvml == 11.5.0 triton >= 2.1.0 +cupy-cuda12x == 12.3.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 03a2b1157652b..86f0925209309 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -283,7 +283,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", is_driver_worker=True, ) - self._run_workers("init_model") + self._run_workers("init_model", cupy_port=get_open_port()) self._run_workers( "load_model", max_concurrent_workers=self.parallel_config. 
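The engine change above hands every worker a freshly allocated TCP port (cupy_port) so the workers can bring up the CuPy NCCL backend added by the hunks below. The following is a minimal, hypothetical two-GPU sketch of how the new cupy_utils API (init_process_group, set_cupy_stream, all_reduce, destroy_process_group) fits together; the port number, tensor shapes, and the mp.spawn driver are illustrative and are not how vLLM's engine actually launches workers.

    import torch
    import torch.multiprocessing as mp

    from vllm.model_executor.parallel_utils import cupy_utils


    def _toy_worker(rank: int, world_size: int, port: int) -> None:
        torch.cuda.set_device(rank)
        cupy_utils.init_process_group(world_size=world_size,
                                      rank=rank,
                                      host="localhost",
                                      port=port)
        x = torch.ones(4, device="cuda")
        # The communication stream must match the compute stream, which is
        # what with_cupy_nccl_for_all_reduce() arranges during graph capture.
        with cupy_utils.set_cupy_stream(torch.cuda.current_stream()):
            cupy_utils.all_reduce(x)  # in-place sum across ranks
        assert torch.allclose(x, torch.full_like(x, float(world_size)))
        cupy_utils.destroy_process_group()


    if __name__ == "__main__":
        # Requires two GPUs; 29500 is an arbitrary free port for this sketch.
        mp.spawn(_toy_worker, args=(2, 29500), nprocs=2)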
diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index 65671994f3309..cf805df892fdc 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -1,14 +1,15 @@ from collections import namedtuple from typing import Any, Dict, List, Optional, Union -from torch.distributed import ProcessGroup - import torch +from torch.distributed import ProcessGroup +from vllm.model_executor.parallel_utils import cupy_utils from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tensor_model_parallel_group, + is_cupy_nccl_enabled_for_all_reduce, ) from vllm.model_executor.parallel_utils.custom_all_reduce import custom_all_reduce @@ -31,8 +32,12 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: out = custom_all_reduce(input_) if out is not None: return out - torch.distributed.all_reduce(input_, - group=get_tensor_model_parallel_group()) + if is_cupy_nccl_enabled_for_all_reduce(): + # TODO: support multiple parallel groups. + cupy_utils.all_reduce(input_) + else: + torch.distributed.all_reduce(input_, + group=get_tensor_model_parallel_group()) return input_ diff --git a/vllm/model_executor/parallel_utils/cupy_utils.py b/vllm/model_executor/parallel_utils/cupy_utils.py new file mode 100644 index 0000000000000..f8cffc01e3c36 --- /dev/null +++ b/vllm/model_executor/parallel_utils/cupy_utils.py @@ -0,0 +1,130 @@ +"""CuPy utilities for all-reduce. + +We use CuPy all-reduce instead of torch.distributed.all_reduce when capturing +CUDA graphs, because torch.distributed.all_reduce causes errors when capturing +CUDA graphs. + +NOTE: We use CuPy 12.3 since CuPy 13.0 does not support Python 3.8. +TODO: Remove this file when torch.distributed.all_reduce is fixed. +""" +import contextlib + +import torch +from torch.distributed import ReduceOp + +try: + import cupy + from cupy.cuda import nccl + from cupyx.distributed import NCCLBackend +except ImportError as e: + cupy = e + nccl = None + + class NCCLBackend: + ... + + +_OP_MAPPING = { + ReduceOp.SUM: "sum", + ReduceOp.PRODUCT: "prod", + ReduceOp.MIN: "min", + ReduceOp.MAX: "max", +} + + +class NCCLBackendWithBFloat16(NCCLBackend): + # This is enough to add bfloat16 support for most operations, + # but broadcast will fail (will require changes in compiled + # cupy code). + def _get_nccl_dtype_and_count(self, array, count=None): + nccl_dtype, count = super()._get_nccl_dtype_and_count(array, count) + torch_dtype = getattr(array, "_torch_dtype", None) + if torch_dtype is torch.bfloat16: + nccl_dtype = nccl.NCCL_BFLOAT16 + return nccl_dtype, count + + def barrier(self) -> None: + raise RuntimeError( + "Currently, CuPy NCCL barrier is not supported since the TCP " + "store is immediately stopped after the initialization.") + + +_NCCL_BACKEND = None +_WORLD_SIZE = 0 + + +def is_initialized() -> bool: + """Returns whether the NCCL backend is initialized.""" + return _NCCL_BACKEND is not None + + +@contextlib.contextmanager +def set_cupy_stream(stream: torch.cuda.Stream): + """Set the cuda stream for communication""" + cupy_stream = cupy.cuda.ExternalStream(stream.cuda_stream, + stream.device_index) + with cupy_stream: + yield + + +def init_process_group(world_size: int, rank: int, host: str, + port: int) -> None: + """Initializes the CuPy NCCL backend. + + # TODO: handle NCCL timeouts. 
+ """ + assert not is_initialized() + + if isinstance(cupy, Exception): + raise ImportError( + "NCCLBackend is not available. Please install cupy.") from cupy + + # TODO(woosuk): Create TP and PP process groups for CuPy. + global _NCCL_BACKEND + global _WORLD_SIZE + assert world_size > 0, f"{world_size=} should be a positive integer" + assert 0 <= rank < world_size, ( + f"{rank=} should be a integer between [0, {world_size})") + + cupy.cuda.runtime.setDevice(torch.cuda.current_device()) + _NCCL_BACKEND = NCCLBackendWithBFloat16(world_size, rank, host, port) + _WORLD_SIZE = world_size + + # Stop the TCP store to prevent the deadlock issues at termination time. + # FIXME(woosuk): This is hacky. Find a more robust solution. + if rank == 0 and hasattr(_NCCL_BACKEND, "_store"): + _NCCL_BACKEND._store.stop() + + +def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None: + """All-reduces the input tensor across the process group.""" + assert input_.is_cuda, f"{input_} should be a cuda tensor" + # Hack to support bfloat16 + torch_dtype = input_.dtype + if torch_dtype is torch.bfloat16: + # We need to view as float16, otherwise + # cupy will fail. This will not change + # the underlying data. + input_ = input_.view(torch.float16) + cupy_input = cupy.asarray(input_) + cupy_input._torch_dtype = torch_dtype # pylint: disable=protected-access + _NCCL_BACKEND.all_reduce(in_array=cupy_input, + out_array=cupy_input, + op=_OP_MAPPING[op]) + + +def destroy_process_group() -> None: + """Destroys the NCCL backend.""" + global _NCCL_BACKEND + global _WORLD_SIZE + _NCCL_BACKEND = None + _WORLD_SIZE = 0 + + +def get_world_size() -> int: + """Returns the world size.""" + return _WORLD_SIZE + + +def get_nccl_backend(): + return _NCCL_BACKEND diff --git a/vllm/model_executor/parallel_utils/parallel_state.py b/vllm/model_executor/parallel_utils/parallel_state.py index 59cc196538571..aeb07f64c37dc 100644 --- a/vllm/model_executor/parallel_utils/parallel_state.py +++ b/vllm/model_executor/parallel_utils/parallel_state.py @@ -3,9 +3,12 @@ # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. """Tensor and pipeline parallel groups.""" +import contextlib import torch +from vllm.model_executor.parallel_utils import cupy_utils + # Tensor model parallel group that the current rank belongs to. _TENSOR_MODEL_PARALLEL_GROUP = None # Pipeline model parallel group that the current rank belongs to. @@ -206,3 +209,37 @@ def destroy_model_parallel(): _PIPELINE_MODEL_PARALLEL_GROUP = None global _PIPELINE_GLOBAL_RANKS _PIPELINE_GLOBAL_RANKS = None + + # Destroy the cupy states if any. + cupy_utils.destroy_process_group() + + +# Whether to use cupy for nccl all reduce. +# We use cupy for all reduce when using CUDA graph, because torch.distributed +# is not well supported by CUDA graph. +_ENABLE_CUPY_FOR_ALL_REDUCE = False + + +@contextlib.contextmanager +def with_cupy_nccl_for_all_reduce(): + """use CuPy nccl instead of torch.distributed for all reduce""" + tp_size = get_tensor_model_parallel_world_size() + if tp_size == 1: + # No-op. + # NOTE(woosuk): We don't initialize CuPy when tp_size is 1. 
+ yield + else: + global _ENABLE_CUPY_FOR_ALL_REDUCE + old = _ENABLE_CUPY_FOR_ALL_REDUCE + _ENABLE_CUPY_FOR_ALL_REDUCE = True + + stream = torch.cuda.current_stream() + with cupy_utils.set_cupy_stream(stream): + yield + _ENABLE_CUPY_FOR_ALL_REDUCE = old + + +def is_cupy_nccl_enabled_for_all_reduce(): + """check if CuPy nccl is enabled for all reduce""" + global _ENABLE_CUPY_FOR_ALL_REDUCE + return _ENABLE_CUPY_FOR_ALL_REDUCE diff --git a/vllm/test_utils.py b/vllm/test_utils.py index 4f74c05038e70..75bf6ce373d93 100644 --- a/vllm/test_utils.py +++ b/vllm/test_utils.py @@ -15,8 +15,11 @@ def init_test_distributed_environment( tensor_parallel_size, worker_use_ray=True) distributed_init_method = f"tcp://localhost:{distributed_init_port}" - init_distributed_environment(parallel_config, rank, - distributed_init_method) + init_distributed_environment( + parallel_config, + rank, + cupy_port=None, + distributed_init_method=distributed_init_method) def multi_process_tensor_parallel( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index fce0009e3097d..62f7530868ade 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -5,11 +5,15 @@ import torch import torch.nn as nn -from vllm.config import DeviceConfig, ModelConfig, LoRAConfig, ParallelConfig, SchedulerConfig +from vllm.config import (DeviceConfig, ModelConfig, LoRAConfig, ParallelConfig, + SchedulerConfig) from vllm.logger import init_logger from vllm.model_executor import get_model, InputMetadata, SamplingMetadata from vllm.model_executor.parallel_utils.communication_op import ( broadcast_tensor_dict) +from vllm.model_executor.parallel_utils.cupy_utils import get_nccl_backend +from vllm.model_executor.parallel_utils.parallel_state import ( + with_cupy_nccl_for_all_reduce) from vllm.model_executor.parallel_utils import custom_all_reduce from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata @@ -644,6 +648,10 @@ def list_loras(self) -> Set[int]: @torch.inference_mode() def capture_model(self, kv_caches: List[KVCache]) -> None: + # NOTE(woosuk): This is a hack to ensure that the NCCL backend is never + # deleted before the CUDA graphs. + self.cupy_nccl_backend = get_nccl_backend() + assert not self.model_config.enforce_eager logger.info("Capturing the model for CUDA graphs. This may lead to " "unexpected consequences if the model is not static. To " @@ -674,6 +682,12 @@ def capture_model(self, kv_caches: List[KVCache]) -> None: # NOTE: Capturing the largest batch size first may help reduce the # memory usage of CUDA graph. + # NOTE(woosuk): There are 3 backends for all-reduce: custom all-reduce + # kernel, CuPy NCCL, and PyTorch NCCL. When using CUDA graph, we use + # either custom all-reduce kernel or CuPy NCCL. When not using CUDA + # graph, we use either custom all-reduce kernel or PyTorch NCCL. + # We always prioritize using custom all-reduce kernel but fall back + # to PyTorch or CuPy NCCL if it is disabled or not supported. with custom_all_reduce.capture(): for batch_size in reversed(batch_size_capture_list): # Create dummy input_metadata. @@ -713,6 +727,14 @@ def capture_model(self, kv_caches: List[KVCache]) -> None: # This usually takes < 10 seconds. logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs.") + def __del__(self) -> None: + # Delete the CUDA graphs before deleting the CuPy NCCL communicator. + # NOTE(woosuk): This is necessary because otherwise deadlocks can + # happen. 
+ # FIXME(woosuk): This is a bit hacky. Find a more robust solution. + self.graph_runners.clear() + self.cupy_nccl_backend = None + class CUDAGraphRunner: @@ -734,18 +756,8 @@ def capture( # Run the model once without capturing the graph. # This is to make sure that the captured graph does not include the # kernel launches for initial benchmarking (e.g., Triton autotune). - self.model( - input_ids, - positions, - kv_caches, - input_metadata, - ) - torch.cuda.synchronize() - - # Capture the graph. - self.graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(self.graph, pool=memory_pool): - hidden_states = self.model( + with with_cupy_nccl_for_all_reduce(): + self.model( input_ids, positions, kv_caches, @@ -753,6 +765,20 @@ def capture( ) torch.cuda.synchronize() + # Capture the graph. + # NOTE(woosuk): Python 3.8 does not support multi-line with statements. + # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement + self.graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(self.graph, pool=memory_pool): # noqa: SIM117 + with with_cupy_nccl_for_all_reduce(): + hidden_states = self.model( + input_ids, + positions, + kv_caches, + input_metadata, + ) + torch.cuda.synchronize() + # Save the input and output buffers. self.input_buffers = { "input_ids": input_ids, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index c97e82a55a1ee..b616040367c84 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -9,6 +9,7 @@ from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig) from vllm.model_executor import set_random_seed +from vllm.model_executor.parallel_utils import cupy_utils from vllm.model_executor.parallel_utils.communication_op import ( broadcast_tensor_dict) from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar @@ -67,7 +68,7 @@ def __init__( self.cache_events = None self.gpu_cache = None - def init_model(self) -> None: + def init_model(self, cupy_port: Optional[int] = None) -> None: if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until # the synchronization point. This causes the memory usage to grow @@ -88,7 +89,7 @@ def init_model(self) -> None: f"Not support device type: {self.device_config.device}") # Initialize the distributed environment. init_distributed_environment(self.parallel_config, self.rank, - self.distributed_init_method) + cupy_port, self.distributed_init_method) if not self.parallel_config.disable_custom_all_reduce: init_custom_ar() # Initialize the model. @@ -233,6 +234,7 @@ def list_loras(self) -> Set[int]: def init_distributed_environment( parallel_config: ParallelConfig, rank: int, + cupy_port: Optional[int], distributed_init_method: Optional[str] = None, ) -> None: """Initialize the distributed environment.""" @@ -255,8 +257,28 @@ def init_distributed_environment( init_method=distributed_init_method, ) + if cupy_utils.is_initialized(): + cupy_world_size = cupy_utils.get_world_size() + if cupy_world_size != parallel_config.world_size: + raise RuntimeError( + "cupy.distributed is already initialized but the cupy world " + "size does not match parallel_config.world_size " + f"({cupy_world_size} vs. {parallel_config.world_size}).") + elif parallel_config.world_size > 1 and cupy_port is not None: + # NOTE(woosuk): We don't initialize CuPy process group when world size + # is 1. + # TODO(woosuk): Support multi-node connection. 
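# Illustrative sketch (not vLLM code): the warm-up-then-capture pattern that
# the CUDAGraphRunner.capture() hunk above follows. The eager warm-up run
# keeps one-off kernel launches (e.g. Triton autotuning) out of the captured
# graph; replay then reuses the same static input/output buffers.
# The toy nn.Linear model and the shapes here are assumptions for illustration.
import torch


def capture_and_replay_sketch() -> torch.Tensor:
    model = torch.nn.Linear(16, 16).cuda()
    static_in = torch.zeros(8, 16, device="cuda")

    # Warm-up run outside the graph.
    model(static_in)
    torch.cuda.synchronize()

    # Capture; torch.cuda.graph() uses its own capture stream internally.
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_out = model(static_in)
    torch.cuda.synchronize()

    # Replay with new data: mutate the captured input buffer in place.
    static_in.copy_(torch.randn_like(static_in))
    graph.replay()
    return static_out.clone()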
+ cupy_utils.init_process_group( + world_size=parallel_config.world_size, + rank=rank, + host="localhost", + port=cupy_port, + ) + # A small all_reduce for warmup. torch.distributed.all_reduce(torch.zeros(1).cuda()) + if cupy_utils.is_initialized(): + cupy_utils.all_reduce(torch.zeros(1).cuda()) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) From 317b29de0f16428610e2e4d6a6953bee5a2d0ec2 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 13 Feb 2024 14:22:22 -0800 Subject: [PATCH 059/112] Remove Yi model definition, please use `LlamaForCausalLM` instead (#2854) Co-authored-by: Roy --- docs/source/models/supported_models.rst | 7 +- vllm/model_executor/models/yi.py | 330 -------------------- vllm/transformers_utils/config.py | 1 - vllm/transformers_utils/configs/__init__.py | 2 - vllm/transformers_utils/configs/yi.py | 64 ---- 5 files changed, 2 insertions(+), 402 deletions(-) delete mode 100644 vllm/model_executor/models/yi.py delete mode 100644 vllm/transformers_utils/configs/yi.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index a806aa4e29452..5d7f401cc6e2c 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -51,8 +51,8 @@ Alongside each architecture, we include some popular models that use it. - InternLM2 - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. * - :code:`LlamaForCausalLM` - - LLaMA, LLaMA-2, Vicuna, Alpaca, Koala, Guanaco - - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`young-geng/koala`, etc. + - LLaMA, LLaMA-2, Vicuna, Alpaca, Yi + - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc. * - :code:`MistralForCausalLM` - Mistral, Mistral-Instruct - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. @@ -77,9 +77,6 @@ Alongside each architecture, we include some popular models that use it. * - :code:`StableLMEpochForCausalLM` - StableLM - :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. - * - :code:`YiForCausalLM` - - Yi - - :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc. If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model. diff --git a/vllm/model_executor/models/yi.py b/vllm/model_executor/models/yi.py deleted file mode 100644 index 53daa6c4cd939..0000000000000 --- a/vllm/model_executor/models/yi.py +++ /dev/null @@ -1,330 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only Yi model (https://01.ai) compatible with HuggingFace weights.""" -from typing import Any, Dict, List, Optional, Tuple - -import torch -from torch import nn -from vllm.transformers_utils.configs.yi import YiConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class YiMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class YiAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class YiDecoderLayer(nn.Module): - - def __init__( - self, - config: YiConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = YiAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - linear_method=linear_method, - ) - self.mlp = YiMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.ln1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.ln2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.ln1(hidden_states) - else: - hidden_states, residual = self.ln1(hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.ln2(hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class YiModel(nn.Module): - - def __init__( - self, - config: YiConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - YiDecoderLayer(config, 
linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class YiForCausalLM(nn.Module): - - def __init__( - self, - config: YiConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = YiModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 8b16e559b24f2..1660fe181d2cc 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -12,7 +12,6 @@ "qwen": QWenConfig, "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) - "yi": YiConfig, } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 284867414e0ed..8a3aec9efc572 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -7,7 +7,6 @@ # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. 
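# Illustrative usage note (not part of the patch): after this removal, Yi
# checkpoints are served through LlamaForCausalLM, as the supported-models
# table updated above now lists them. Assuming the checkpoint ships a
# Llama-style config (which this change relies on), user code is unchanged;
# the model name and sampling settings below are only examples.
from vllm import LLM, SamplingParams

llm = LLM(model="01-ai/Yi-6B")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)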
from vllm.transformers_utils.configs.falcon import RWConfig -from vllm.transformers_utils.configs.yi import YiConfig __all__ = [ "AquilaConfig", @@ -16,5 +15,4 @@ "MPTConfig", "QWenConfig", "RWConfig", - "YiConfig", ] diff --git a/vllm/transformers_utils/configs/yi.py b/vllm/transformers_utils/configs/yi.py deleted file mode 100644 index 359922ed26952..0000000000000 --- a/vllm/transformers_utils/configs/yi.py +++ /dev/null @@ -1,64 +0,0 @@ -""" Yi model configuration""" -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - -logger = logging.get_logger(__name__) - -Yi_PRETRAINED_CONFIG_ARCHIVE_MAP = {} - - -class YiConfig(PretrainedConfig): - r""" - Reference: - https://huggingface.co/01-ai/Yi-6B/blob/main/configuration_yi.py - """ - model_type = "Yi" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=64000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - output_attentions=False, - rope_theta=5000000.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.output_attentions = output_attentions - self.rope_theta = rope_theta - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) From 2a543d6efecc4e0fe391cbccb68d99ab42e37c33 Mon Sep 17 00:00:00 2001 From: Terry <149540247+tterrysun@users.noreply.github.com> Date: Tue, 13 Feb 2024 15:55:45 -0800 Subject: [PATCH 060/112] Add LoRA support for Mixtral (#2831) * add mixtral lora support * formatting * fix incorrectly ported logic * polish tests * minor fixes and refactoring * minor fixes * formatting * rename and remove redundant logic * refactoring * refactoring * minor fix * minor refactoring * fix code smell --- tests/lora/conftest.py | 5 ++ tests/lora/test_lora_manager.py | 82 +++++++++++++---------- tests/lora/test_mixtral.py | 53 +++++++++++++++ vllm/lora/models.py | 96 +++++++++------------------ vllm/lora/worker_manager.py | 21 +++--- vllm/model_executor/model_loader.py | 2 +- vllm/model_executor/models/llama.py | 35 ++++++++-- vllm/model_executor/models/mistral.py | 27 +++++++- vllm/model_executor/models/mixtral.py | 40 ++++++++++- vllm/worker/model_runner.py | 11 ++- 10 files changed, 251 insertions(+), 121 deletions(-) create mode 100644 tests/lora/test_mixtral.py diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 163c3c70261c0..0ca0715334c25 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -121,6 +121,11 @@ def sql_lora_files(): return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") +@pytest.fixture(scope="session") +def mixtral_lora_files(): + return 
snapshot_download(repo_id="terrysun/mixtral-lora-adapter") + + @pytest.fixture def llama_2_7b_engine_extra_embeddings() -> nn.Module: cleanup() diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 78a4a5bc5ecd2..2d4fc085b719b 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -11,25 +11,35 @@ RowParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.models import (EMBEDDING_MODULES, LoRAModel, LoRAModelManager, +from vllm.lora.models import (LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager, LoRAMapping) from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, WorkerLoRAManager) from vllm.model_executor.layers.linear import RowParallelLinear +EMBEDDING_MODULES = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", +} + +EMBEDDING_PADDING_MODULES = ["lm_head"] + def test_from_lora_tensors(sql_lora_files): tensors = load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) new_embeddings = load_file( os.path.join(sql_lora_files, "new_embeddings.safetensors")) - lora_model = LoRAModel.from_lora_tensors(1, - 8, - 16, - tensors, - "cuda", - embeddings=new_embeddings) + lora_model = LoRAModel.from_lora_tensors( + 1, + 8, + 16, + tensors, + "cuda", + embeddings=new_embeddings, + embedding_modules=EMBEDDING_MODULES, + embedding_padding_modules=EMBEDDING_PADDING_MODULES) for module_name, lora in lora_model.loras.items(): assert lora.module_name == module_name assert lora.rank == 8 @@ -90,14 +100,11 @@ def create_packed_lora( def test_replace_submodules(dist_init, dummy_model): model = dummy_model - manager = LoRAModelManager(model, - 1, - 1, - 1, - LoRAConfig(max_lora_rank=8, - max_cpu_loras=8, - max_loras=8), - lora_target_modules=["dense1", "layer1.dense2"]) + model.supported_lora_modules = ["dense1", "layer1.dense2"] + model.packed_modules_mapping = {} + manager = LoRAModelManager( + model, 1, 1, 1, + LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8)) model = manager.model assert isinstance(model.get_submodule("dense1"), @@ -111,16 +118,14 @@ def test_replace_submodules(dist_init, dummy_model): def test_lora_model_manager(dist_init, dummy_model): model = dummy_model + model.supported_lora_modules = ["dense1", "dense2", "lm_head"] + model.packed_modules_mapping = {} model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) manager = LoRAModelManager( - model, - 2, - 2, - 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2), - lora_target_modules=["dense1", "dense2", "lm_head"]) + model, 2, 2, 2, + LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_lora(model_lora1) assert manager.activate_lora(1) @@ -159,16 +164,14 @@ def test_lora_model_manager(dist_init, dummy_model): def test_lora_lru_cache_model_manager(dist_init, dummy_model): model = dummy_model + model.supported_lora_modules = ["dense1", "dense2", "lm_head"] + model.packed_modules_mapping = {} model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) manager = LRUCacheLoRAModelManager( - model, - 
2, - 2, - 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2), - lora_target_modules=["dense1", "dense2", "lm_head"]) + model, 2, 2, 2, + LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_lora(model_lora1) assert manager.activate_lora(1) @@ -212,14 +215,15 @@ def test_lru_lora_model_manager(dist_init, dummy_model): # This tests just the LRU cache functionality, everything else is # tested in test_lora_model_manager model = dummy_model + model.supported_lora_modules = ["dense1", "dense2", "lm_head"] + model.packed_modules_mapping = {} model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"]) manager = LRUCacheLoRAModelManager( model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2), - ["dense1", "dense2", "lm_head"]) + LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) assert all(x is None for x in manager.lora_index_to_id) @@ -289,8 +293,9 @@ def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings, sql_lora_files): lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) worker_lora_manager = LRUCacheWorkerLoRAManager( - 4, 2, llama_2_7b_model_extra_embeddings.config.vocab_size, lora_config, - torch.device("cuda")) + 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - + lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"), + EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) worker_lora_manager.create_lora_manager(llama_2_7b_model_extra_embeddings) mapping = LoRAMapping([], []) @@ -362,8 +367,9 @@ def test_worker_lora_manager(llama_2_7b_model_extra_embeddings, # Should remove every LoRA not specified in the request. 
lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) worker_lora_manager = WorkerLoRAManager( - 4, 2, llama_2_7b_model_extra_embeddings.config.vocab_size, lora_config, - torch.device("cuda")) + 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - + lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"), + EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) worker_lora_manager.create_lora_manager(llama_2_7b_model_extra_embeddings) mapping = LoRAMapping([], []) @@ -428,6 +434,13 @@ def test_worker_lora_manager(llama_2_7b_model_extra_embeddings, def test_packed_loras(dist_init, dummy_model_gate_up): model = dummy_model_gate_up + model.supported_lora_modules = ["gate_up_proj"] + model.packed_modules_mapping = { + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } model_lora = create_packed_lora( 1, model, @@ -443,8 +456,7 @@ def test_packed_loras(dist_init, dummy_model_gate_up): manager = LoRAModelManager( model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2), - ["gate_up_proj"]) + LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) model = manager.model assert isinstance(model.get_submodule("gate_up_proj"), diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py new file mode 100644 index 0000000000000..e45fb92ab7edf --- /dev/null +++ b/tests/lora/test_mixtral.py @@ -0,0 +1,53 @@ +import pytest +import torch + +import vllm +from vllm.lora.request import LoRARequest + +MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" + + +def do_sample(llm, lora_path: str, lora_id: int): + prompts = [ + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. 
This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", + ] + sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. + generated_texts = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text.strip() + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +@pytest.mark.parametrize("tp_size", [4]) +def test_mixtral_lora(mixtral_lora_files, tp_size): + if torch.cuda.device_count() < tp_size: + pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") + + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=tp_size, + worker_use_ray=True) + + expected_lora_output = [ + "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", + "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", + "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", + ] + + assert do_sample(llm, mixtral_lora_files, + lora_id=1) == expected_lora_output + assert do_sample(llm, mixtral_lora_files, + lora_id=2) == expected_lora_output diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 6c78c4a2c7771..7386d21c58e4e 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,8 +4,7 @@ import math import os import re -from typing import (Any, Callable, Dict, Hashable, List, Optional, Tuple, Type, - Union) +from typing import (Any, Callable, Dict, Hashable, List, Optional, Tuple, Type) import safetensors.torch import torch @@ -20,36 +19,6 @@ logger = logging.getLogger(__name__) -# TODO: The mappings below should be moved to individual model classes. 
- -PACKED_MODULES_CFG = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], -} - -TARGET_MODULES_QKV = [ - "qkv_proj", - "o_proj", - "gate_up_proj", - "down_proj", - "embed_tokens", - "lm_head", -] - -EMBEDDING_MODULES = { - "embed_tokens": "input_embeddings", - "lm_head": "output_embeddings", -} - -EMBEDDING_PADDING_MODULES = ["lm_head"] - _GLOBAL_LORA_ID = 0 @@ -169,6 +138,8 @@ def from_lora_tensors( dtype: Optional[torch.dtype] = None, embeddings: Optional[Dict[str, torch.Tensor]] = None, target_embedding_padding: Optional[int] = None, + embedding_modules: Optional[Dict[str, str]] = None, + embedding_padding_modules: Optional[List[str]] = None, ) -> "LoRAModel": """Create a LoRAModel from a dictionary of tensors.""" pin_memory = str(device) == "cpu" and not in_wsl() @@ -179,11 +150,11 @@ def from_lora_tensors( lora_embeddings_tensor = None if embeddings: embeddings_module = next( - (k for k in EMBEDDING_MODULES if k in module_name), + (k for k in embedding_modules if k in module_name), None) if embeddings_module: lora_embeddings_tensor = embeddings[ - EMBEDDING_MODULES[embeddings_module]].to( + embedding_modules[embeddings_module]].to( device=device, dtype=dtype) if pin_memory: lora_embeddings_tensor = ( @@ -201,7 +172,7 @@ def from_lora_tensors( loras[module_name].lora_b = tensor.to(device=device, dtype=dtype).t() if any(name in module_name - for name in EMBEDDING_PADDING_MODULES + for name in embedding_padding_modules ) and target_embedding_padding is not None: lora_b = loras[module_name].lora_b assert target_embedding_padding >= lora_b.shape[1] @@ -218,12 +189,15 @@ def from_lora_tensors( @classmethod def from_local_checkpoint( - cls, - lora_dir: str, - lora_model_id: Optional[int] = None, - device: str = "cuda", - dtype: Optional[torch.dtype] = None, - target_embedding_padding: Optional[int] = None) -> "LoRAModel": + cls, + lora_dir: str, + lora_model_id: Optional[int] = None, + device: str = "cuda", + dtype: Optional[torch.dtype] = None, + target_embedding_padding: Optional[int] = None, + embedding_modules: Optional[Dict[str, str]] = None, + embedding_padding_modules: Optional[List[str]] = None, + ) -> "LoRAModel": """Create a LoRAModel from a local checkpoint.""" lora_config_path = os.path.join(lora_dir, "adapter_config.json") lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") @@ -260,6 +234,8 @@ def from_local_checkpoint( dtype=dtype, embeddings=embeddings, target_embedding_padding=target_embedding_padding, + embedding_modules=embedding_modules, + embedding_padding_modules=embedding_padding_modules, ) @@ -273,8 +249,6 @@ def __init__( max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, - lora_target_modules: Union[str, List[str]] = TARGET_MODULES_QKV, - packed_modules_mapping: Dict[str, List[str]] = PACKED_MODULES_CFG, ): """Create a LoRAModelManager and adapter for a given model. @@ -286,13 +260,6 @@ def __init__( in a single batch. vocab_size: the vocab size of the model. lora_config: the LoRA configuration. - lora_target_modules: the target modules patterns to be adapted. - Support both single module name and a list of module names. - packed_modules_mapping: the mapping for packed modules. vLLM - packs some modules into one module, e.g., qkv_proj - is packed of q_proj, k_proj, and v_proj. These modules - have a single layer in the original model, but they are split - into multiple layers in the adapted model. 
""" self.lora_config = lora_config self.max_num_seqs = max_num_seqs @@ -320,11 +287,11 @@ def __init__( self.indices_len = [None] * 4 self.model: nn.Module = model - self.lora_target_modules: List[str] = ([ - lora_target_modules - ] if isinstance(lora_target_modules, str) else lora_target_modules) - self.lora_target_modules = copy.deepcopy(lora_target_modules) - self.packed_modules_mapping = copy.deepcopy(packed_modules_mapping) + if hasattr(self.model, "supported_lora_modules"): + self.supported_lora_modules = copy.deepcopy( + self.model.supported_lora_modules) + self.packed_modules_mapping = copy.deepcopy( + self.model.packed_modules_mapping) self.packed_modules: Dict[str, List[str]] = {} self.modules: Dict[str, "BaseLayerWithLoRA"] = {} self._registered_loras: Dict[int, LoRAModel] = {} @@ -468,7 +435,11 @@ def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): assert isinstance(module, BaseLayerWithLoRA) self.modules[module_name] = module - def create_dummy_lora(self, lora_id: int, rank: int) -> LoRAModel: + def create_dummy_lora( + self, + lora_id: int, + rank: int, + embedding_modules: Optional[Dict[str, str]] = None) -> LoRAModel: """Create zero-initialized LoRAModel for warmup.""" model = LoRAModel(lora_id, rank, {}) for module_name, module in self.model.named_modules(): @@ -477,7 +448,7 @@ def create_dummy_lora(self, lora_id: int, rank: int) -> LoRAModel: continue parts = module_name.split(".") if module_name not in self.packed_modules: - if parts[-1] in EMBEDDING_MODULES: + if parts[-1] in embedding_modules: input_dim = (module.base_layer.org_vocab_size + self.lora_config.lora_extra_vocab_size if hasattr(module.base_layer, "org_vocab_size") @@ -531,7 +502,7 @@ def _match_target_modules(self, module_name: str): re.match( r".*\.{target_module}$".format(target_module=target_module), module_name) or target_module == module_name - for target_module in self.lora_target_modules) + for target_module in self.supported_lora_modules) def _register_packed_modules(self, module_full_name: str) -> None: parts = module_full_name.split(".") @@ -586,12 +557,9 @@ def __init__( max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, - lora_target_modules: Union[str, List[str]] = TARGET_MODULES_QKV, - packed_modules_mapping: Dict[str, List[str]] = PACKED_MODULES_CFG, ): super().__init__(model, max_num_seqs, max_num_batched_tokens, - vocab_size, lora_config, lora_target_modules, - packed_modules_mapping) + vocab_size, lora_config) self._registered_loras: LoRALRUCache = LoRALRUCache( self.capacity, self.deactivate_lora) self._active_loras: LoRALRUCache = LoRALRUCache( @@ -637,11 +605,10 @@ def create_lora_manager( max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, - target_modules: Union[str, List[str]] = TARGET_MODULES_QKV, lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager, **kwargs) -> LoRAModelManager: """Create a LoRA adapter for a given model.""" - if not getattr(model, "supports_lora", False): + if not hasattr(model, "supported_lora_modules"): raise ValueError(f"Model {type(model)} is not supported for LoRA.") lora_manager = lora_manager_cls( model=model, @@ -649,6 +616,5 @@ def create_lora_manager( max_num_batched_tokens=max_num_batched_tokens, vocab_size=vocab_size, lora_config=lora_config, - lora_target_modules=target_modules, **kwargs) return lora_manager diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index a507c08588dad..7e92bc93ab472 100644 --- a/vllm/lora/worker_manager.py +++ 
b/vllm/lora/worker_manager.py @@ -1,10 +1,10 @@ import logging from abc import ABC, abstractmethod, abstractproperty -from typing import Any, List, Optional, Set, Type, Union +from typing import Any, Dict, List, Optional, Set, Type import torch -from vllm.lora.models import (TARGET_MODULES_QKV, LoRAModel, LoRAModelManager, +from vllm.lora.models import (LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager, create_lora_manager) from vllm.lora.request import LoRARequest from vllm.lora.layers import LoRAMapping @@ -13,7 +13,7 @@ logger = logging.getLogger(__name__) -class WorkerLoRAManager(ABC): +class AbstractWorkerLoRAManager(ABC): """Abstract class for managing LoRA models on the worker side.""" def __init__(self, max_num_seqs: int, max_num_batched_tokens: int, @@ -33,7 +33,6 @@ def is_enabled(self) -> bool: def create_lora_manager( self, model: torch.nn.Module, - target_modules: Union[str, List[str]] = TARGET_MODULES_QKV, ) -> Any: ... @@ -63,7 +62,7 @@ def list_loras(self) -> Set[int]: ... -class WorkerLoRAManager(WorkerLoRAManager): +class WorkerLoRAManager(AbstractWorkerLoRAManager): """WorkerLoRAManager that manages LoRA models on the worker side. Every request, the requested LoRAs will be loaded (unless they are already @@ -78,10 +77,14 @@ def __init__( vocab_size: int, lora_config: LoRAConfig, device: torch.device, + embedding_modules: Dict[str, str], + embedding_padding_modules: List[str], lora_model_cls: Type[LoRAModel] = LoRAModel, ): self._lora_manager: Optional[LoRAModelManager] = None self._lora_model_cls = lora_model_cls + self.embedding_modules = embedding_modules + self.embedding_padding_modules = embedding_padding_modules super().__init__(max_num_seqs, max_num_batched_tokens, vocab_size, lora_config, device) @@ -92,13 +95,11 @@ def is_enabled(self) -> bool: def create_lora_manager( self, model: torch.nn.Module, - target_modules: Union[str, List[str]] = TARGET_MODULES_QKV, ) -> Any: lora_manager = create_lora_manager( model, max_num_seqs=self.max_num_seqs, max_num_batched_tokens=self.max_num_batched_tokens, - target_modules=target_modules, vocab_size=self.vocab_size, lora_config=self.lora_config, lora_manager_cls=self._lora_manager_cls, @@ -142,6 +143,8 @@ def _load_lora(self, lora_request: LoRARequest) -> LoRAModel: dtype=self.lora_config.lora_dtype, target_embedding_padding=self.vocab_size + self.lora_config.lora_extra_vocab_size, + embedding_modules=self.embedding_modules, + embedding_padding_modules=self.embedding_padding_modules, ) except Exception as e: raise RuntimeError( @@ -162,7 +165,7 @@ def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: return False return self._lora_manager.add_lora( self._lora_manager.create_dummy_lora(lora_request.lora_int_id, - rank)) + rank, self.embedding_modules)) def add_lora(self, lora_request: LoRARequest) -> bool: if lora_request.lora_int_id in self.list_loras(): @@ -195,11 +198,9 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager): def create_lora_manager( self, model: torch.nn.Module, - target_modules: Union[str, List[str]] = TARGET_MODULES_QKV, ) -> Any: lora_manager = create_lora_manager( model, - target_modules=target_modules, lora_manager_cls=self._lora_manager_cls, max_num_seqs=self.max_num_seqs, vocab_size=self.vocab_size, diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index 4b1e13d9e9e0a..ebe092b5d62ba 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -66,7 +66,7 @@ def get_model(model_config: ModelConfig, # Create a 
model instance. # The weights will be initialized as empty tensors. with torch.device(device_config.device): - if getattr(model_class, "supports_lora", False): + if hasattr(model_class, "supported_lora_modules"): model = model_class(model_config.hf_config, linear_method, lora_config) elif lora_config: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e5a1abebf1420..860a8f267acf8 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -269,7 +269,32 @@ def forward( class LlamaForCausalLM(nn.Module): - supports_lora = True + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + "embed_tokens", + "lm_head", + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] def __init__( self, @@ -281,11 +306,11 @@ def __init__( self.config = config self.linear_method = linear_method self.model = LlamaModel(config, linear_method, lora_config=lora_config) - unpadded_vocab_size = config.vocab_size + self.unpadded_vocab_size = config.vocab_size if lora_config: - unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.lm_head = ParallelLMHead( - unpadded_vocab_size, + self.unpadded_vocab_size, config.hidden_size, org_num_embeddings=config.vocab_size, padding_size=DEFAULT_VOCAB_PADDING_SIZE @@ -293,7 +318,7 @@ def __init__( # compatibility if not lora_config else lora_config.lora_vocab_padding_size, ) - self.sampler = Sampler(unpadded_vocab_size, config.vocab_size) + self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size) def forward( self, diff --git a/vllm/model_executor/models/mistral.py b/vllm/model_executor/models/mistral.py index 01cde67844122..2347ed752d781 100644 --- a/vllm/model_executor/models/mistral.py +++ b/vllm/model_executor/models/mistral.py @@ -265,7 +265,32 @@ def forward( class MistralForCausalLM(nn.Module): - supports_lora = True + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + "embed_tokens", + "lm_head", + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] def __init__( self, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index aeb9d087e954a..6cb1d84965ecf 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -27,6 +27,7 @@ from torch import nn from transformers import MixtralConfig +from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.attention import PagedAttention from vllm.model_executor.layers.fused_moe import fused_moe @@ -38,7 +39,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) + VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) from 
vllm.model_executor.parallel_utils.communication_op import ( tensor_model_parallel_all_reduce) from vllm.model_executor.parallel_utils.parallel_state import ( @@ -292,6 +293,7 @@ def __init__( self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, + org_num_embeddings=self.org_vocab_size, ) self.layers = nn.ModuleList([ MixtralDecoderLayer(config, linear_method=linear_method) @@ -318,18 +320,50 @@ def forward( class MixtralForCausalLM(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "embed_tokens", + "lm_head", + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] def __init__( self, config: MixtralConfig, linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() self.config = config self.linear_method = linear_method self.model = MixtralModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + ) + self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size) def forward( self, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 62f7530868ade..065d589979daa 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -86,11 +86,20 @@ def load_model(self) -> None: vocab_size = self.model.config.vocab_size if self.lora_config: + assert hasattr( + self.model, "supported_lora_modules" + ) and self.model.supported_lora_modules, "Model does not support LoRA" + assert hasattr( + self.model, + "embedding_modules"), "Model does not have embedding_modules" + assert hasattr(self.model, "embedding_padding_modules" + ), "Model does not have embedding_padding_modules" self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens + self.scheduler_config.max_paddings, vocab_size, - self.lora_config, self.device) + self.lora_config, self.device, self.model.embedding_modules, + self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) def set_block_size(self, block_size: int) -> None: From 7eacffd9512c29bfcce0963b5a19da0cd66cc22f Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 13 Feb 2024 17:12:05 -0800 Subject: [PATCH 061/112] Migrate InternLMForCausalLM to LlamaForCausalLM (#2860) Co-authored-by: Roy --- vllm/model_executor/models/__init__.py | 2 +- vllm/model_executor/models/internlm.py | 299 ------------------------- vllm/model_executor/models/llama.py | 6 +- 3 files changed, 5 insertions(+), 302 deletions(-) delete mode 100644 vllm/model_executor/models/internlm.py diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index fb519b3c0cf92..2985e9c69ae34 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -24,7 +24,7 @@ 
"GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"), - "InternLMForCausalLM": ("internlm", "InternLMForCausalLM"), + "InternLMForCausalLM": ("llama", "LlamaForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), # For decapoda-research/llama-* diff --git a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py deleted file mode 100644 index 5d0b93793c89d..0000000000000 --- a/vllm/model_executor/models/internlm.py +++ /dev/null @@ -1,299 +0,0 @@ -# -*- coding: utf-8 -*- -from typing import Any, Dict, List, Optional, Tuple - -import torch -from torch import nn -from transformers import LlamaConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class InternLMMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class InternLMAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - bias: bool, - rope_theta: float = 10000, - max_position_embeddings: int = 8192, - linear_method: Optional[LinearMethodBase] = None, - rope_scaling: Optional[Dict[str, Any]] = None, - ): - super().__init__() - self.hidden_size = hidden_size - tensor_model_parallel_world_size = ( - get_tensor_model_parallel_world_size()) - self.total_num_heads = num_heads - assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = (self.total_num_heads // - tensor_model_parallel_world_size) - self.head_dim = hidden_size // self.total_num_heads - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - bias=bias, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=bias, - linear_method=linear_method, - ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.chunk(chunks=3, dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class InternLMDecoderLayer(nn.Module): - - def __init__( - self, - config: LlamaConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = InternLMAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - bias=config.bias, - rope_theta=rope_theta, - max_position_embeddings=max_position_embeddings, - linear_method=linear_method, - rope_scaling=getattr(config, "rope_scaling", None), - ) - self.mlp = InternLMMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = 
self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class InternLMModel(nn.Module): - - def __init__( - self, - config: LlamaConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - vocab_size = ((config.vocab_size + 63) // 64) * 64 - self.embed_tokens = VocabParallelEmbedding( - vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - InternLMDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class InternLMForCausalLM(nn.Module): - - def __init__( - self, - config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = InternLMModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 860a8f267acf8..6202e81fffa7c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -91,6 +91,7 @@ def __init__( rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, linear_method: Optional[LinearMethodBase] = None, + bias: bool = False, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -120,13 +121,13 @@ def __init__( self.head_dim, self.total_num_heads, self.total_num_kv_heads, - bias=False, + bias=bias, linear_method=linear_method, ) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, - bias=False, + bias=bias, linear_method=linear_method, ) @@ -179,6 +180,7 @@ def __init__( rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, linear_method=linear_method, + bias=getattr(config, "bias", False), ) self.mlp = LlamaMLP( hidden_size=self.hidden_size, From 0c48b37c310254e83cd2906230e87af97cb148ba Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 13 Feb 2024 18:01:15 -0800 Subject: [PATCH 062/112] Fix internlm after https://github.com/vllm-project/vllm/pull/2860 (#2861) --- vllm/model_executor/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 6202e81fffa7c..1d0353d7d396e 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -175,7 +175,8 @@ def __init__( self.self_attn = LlamaAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, + num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, From 7e45107f51bcb38c22dd9916c61226078e8eb26d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 13 Feb 2024 19:52:34 -0800 Subject: [PATCH 063/112] [Fix] Fix memory profiling when GPU is used by multiple processes (#2863) --- vllm/worker/worker.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index b616040367c84..c460e2e01fc85 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -84,6 +84,8 @@ def init_model(self, cupy_port: Optional[int] = None) -> None: torch.cuda.set_device(self.device) _check_if_gpu_supports_dtype(self.model_config.dtype) + torch.cuda.empty_cache() + self.init_gpu_memory = torch.cuda.mem_get_info()[0] else: raise RuntimeError( f"Not support device type: {self.device_config.device}") @@ -126,7 +128,9 @@ def profile_num_available_blocks( # profiled peak memory. torch.cuda.synchronize() free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() - peak_memory = total_gpu_memory - free_gpu_memory + # NOTE(woosuk): Here we assume that the other processes using the same + # GPU did not change their memory usage during the profiling. 
+ peak_memory = self.init_gpu_memory - free_gpu_memory cache_block_size = CacheEngine.get_cache_block_size( block_size, cache_dtype, self.model_config, self.parallel_config) From 87069ccf68c1bd74aec5ff58db360977f0d9d757 Mon Sep 17 00:00:00 2001 From: Nikola Borisov Date: Wed, 14 Feb 2024 10:17:57 -0800 Subject: [PATCH 064/112] Fix docker python version (#2845) --- Dockerfile | 21 +++++---------------- requirements.txt | 2 +- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3db86adf19a91..dd4867702d3de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,21 +4,8 @@ #################### BASE BUILD IMAGE #################### FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev -# Set the DEBIAN_FRONTEND variable to noninteractive to avoid interactive prompts -ENV DEBIAN_FRONTEND=noninteractive - -# Preconfigure tzdata for US Central Time (build running in us-central-1 but this really doesn't matter.) -RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ - && echo 'tzdata tzdata/Zones/America select Chicago' | debconf-set-selections - -# We install an older version of python here for testing to make sure vllm works with older versions of Python. -# For the actual openai compatible server, we will use the latest version of Python. RUN apt-get update -y \ - && apt-get install -y software-properties-common \ - && add-apt-repository ppa:deadsnakes/ppa -y \ - && apt-get update -y \ - && apt-get install -y python3.8 python3.8-dev python3.8-venv python3-pip git \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 + && apt-get install -y python3-pip git # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully @@ -88,8 +75,10 @@ RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip instal #################### RUNTIME BASE IMAGE #################### -# use CUDA base as CUDA runtime dependencies are already installed via pip -FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base +# We used base cuda image because pytorch installs its own cuda libraries. +# However cupy depends on cuda libraries so we had to switch to the runtime image +# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda +FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base # libnccl required for ray RUN apt-get update -y \ diff --git a/requirements.txt b/requirements.txt index b030ae616919b..4fcfe661185b5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,4 @@ pydantic >= 2.0 # Required for OpenAI server. aioprometheus[starlette] pynvml == 11.5.0 triton >= 2.1.0 -cupy-cuda12x == 12.3.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. +cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. 
From 4efbac6d3593ed35fd5b6ccb3958bd96b2c9b4da Mon Sep 17 00:00:00 2001 From: Roy Date: Thu, 15 Feb 2024 04:30:24 +0800 Subject: [PATCH 065/112] Migrate AquilaForCausalLM to LlamaForCausalLM (#2867) --- vllm/model_executor/models/__init__.py | 5 +- vllm/model_executor/models/aquila.py | 342 -------------------- vllm/transformers_utils/config.py | 1 - vllm/transformers_utils/configs/__init__.py | 2 - vllm/transformers_utils/configs/aquila.py | 69 ---- 5 files changed, 2 insertions(+), 417 deletions(-) delete mode 100644 vllm/model_executor/models/aquila.py delete mode 100644 vllm/transformers_utils/configs/aquila.py diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 2985e9c69ae34..5cba1cf0414db 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -10,8 +10,8 @@ # Architecture -> (module, class). _MODELS = { - "AquilaModel": ("aquila", "AquilaForCausalLM"), - "AquilaForCausalLM": ("aquila", "AquilaForCausalLM"), # AquilaChat2 + "AquilaModel": ("llama", "LlamaForCausalLM"), + "AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2 "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), @@ -41,7 +41,6 @@ "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "RWForCausalLM": ("falcon", "FalconForCausalLM"), "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), - "YiForCausalLM": ("yi", "YiForCausalLM") } # Models not supported by ROCm. diff --git a/vllm/model_executor/models/aquila.py b/vllm/model_executor/models/aquila.py deleted file mode 100644 index 2f2bd5ffb4a63..0000000000000 --- a/vllm/model_executor/models/aquila.py +++ /dev/null @@ -1,342 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, List, Optional, Tuple - -import torch -from torch import nn - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs.aquila import AquilaConfig - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class AquilaMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class AquilaRMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - """ - AquilaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - variance = hidden_states.to(torch.float32).pow(2).mean(-1, - keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - - return (self.weight * hidden_states).to(input_dtype) - - -class AquilaAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - max_position_embeddings: int = 8192, - rope_scaling: Optional[Dict[str, Any]] = None, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - assert self.total_num_kv_heads % tp_size == 0 - self.num_kv_heads = self.total_num_kv_heads // tp_size - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - linear_method=linear_method, - ) - 
self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class AquilaDecoderLayer(nn.Module): - - def __init__( - self, - config: AquilaConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = AquilaAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - max_position_embeddings=max_position_embeddings, - rope_scaling=rope_scaling, - linear_method=linear_method, - ) - self.mlp = AquilaMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = AquilaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - # Self Attention - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - return hidden_states - - -class AquilaModel(nn.Module): - - def __init__( - self, - config: AquilaConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - AquilaDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - ) - hidden_states = 
self.norm(hidden_states) - - return hidden_states - - -class AquilaForCausalLM(nn.Module): - - def __init__( - self, - config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = AquilaModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 1660fe181d2cc..b12918e41b32e 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -5,7 +5,6 @@ from vllm.transformers_utils.configs import * _CONFIG_REGISTRY = { - "aquila": AquilaConfig, "baichuan": BaiChuanConfig, "chatglm": ChatGLMConfig, "mpt": MPTConfig, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 8a3aec9efc572..bbba741ca536a 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,4 +1,3 @@ -from vllm.transformers_utils.configs.aquila import AquilaConfig from vllm.transformers_utils.configs.baichuan import BaiChuanConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.mpt import MPTConfig @@ -9,7 +8,6 @@ from vllm.transformers_utils.configs.falcon import RWConfig __all__ = [ - "AquilaConfig", "BaiChuanConfig", "ChatGLMConfig", "MPTConfig", diff --git a/vllm/transformers_utils/configs/aquila.py b/vllm/transformers_utils/configs/aquila.py deleted file mode 100644 index 86a6f2ba304af..0000000000000 --- a/vllm/transformers_utils/configs/aquila.py +++ /dev/null @@ -1,69 +0,0 @@ -# coding=utf-8 -# Copyright 2023 EleutherAI and the HuggingFace Inc. team. 
All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Aquila model configuration""" - -from transformers import PretrainedConfig - - -class AquilaConfig(PretrainedConfig): - model_type = "aquila" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=100008, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.006, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) From 25e86b6a616638cea9ce121a6c28c7b1d69615e7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 14 Feb 2024 12:30:44 -0800 Subject: [PATCH 066/112] Don't use cupy NCCL for AMD backends (#2855) --- .../parallel_utils/custom_all_reduce.py | 4 ++++ vllm/worker/model_runner.py | 22 ++++++++++++++----- vllm/worker/worker.py | 4 +++- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/parallel_utils/custom_all_reduce.py b/vllm/model_executor/parallel_utils/custom_all_reduce.py index 628c151761fb2..ce4c8d02f7694 100644 --- a/vllm/model_executor/parallel_utils/custom_all_reduce.py +++ b/vllm/model_executor/parallel_utils/custom_all_reduce.py @@ -67,6 +67,10 @@ def get_handle() -> Optional["CustomAllreduce"]: return _CA_HANDLE +def is_initialized() -> bool: + return _CA_HANDLE is not None + + @contextmanager def capture(): try: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 065d589979daa..a27b7d9cb4dcf 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,3 +1,4 @@ +import contextlib import time from typing import Dict, List, Optional, Tuple, Set, Union @@ -9,9 +10,9 @@ SchedulerConfig) from vllm.logger import init_logger from vllm.model_executor import get_model, InputMetadata, SamplingMetadata +from vllm.model_executor.parallel_utils import cupy_utils from 
vllm.model_executor.parallel_utils.communication_op import ( broadcast_tensor_dict) -from vllm.model_executor.parallel_utils.cupy_utils import get_nccl_backend from vllm.model_executor.parallel_utils.parallel_state import ( with_cupy_nccl_for_all_reduce) from vllm.model_executor.parallel_utils import custom_all_reduce @@ -659,7 +660,7 @@ def list_loras(self) -> Set[int]: def capture_model(self, kv_caches: List[KVCache]) -> None: # NOTE(woosuk): This is a hack to ensure that the NCCL backend is never # deleted before the CUDA graphs. - self.cupy_nccl_backend = get_nccl_backend() + self.cupy_nccl_backend = cupy_utils.get_nccl_backend() assert not self.model_config.enforce_eager logger.info("Capturing the model for CUDA graphs. This may lead to " @@ -689,8 +690,6 @@ def capture_model(self, kv_caches: List[KVCache]) -> None: bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size ] - # NOTE: Capturing the largest batch size first may help reduce the - # memory usage of CUDA graph. # NOTE(woosuk): There are 3 backends for all-reduce: custom all-reduce # kernel, CuPy NCCL, and PyTorch NCCL. When using CUDA graph, we use # either custom all-reduce kernel or CuPy NCCL. When not using CUDA @@ -698,6 +697,8 @@ def capture_model(self, kv_caches: List[KVCache]) -> None: # We always prioritize using custom all-reduce kernel but fall back # to PyTorch or CuPy NCCL if it is disabled or not supported. with custom_all_reduce.capture(): + # NOTE: Capturing the largest batch size first may help reduce the + # memory usage of CUDA graph. for batch_size in reversed(batch_size_capture_list): # Create dummy input_metadata. input_metadata = InputMetadata( @@ -765,7 +766,7 @@ def capture( # Run the model once without capturing the graph. # This is to make sure that the captured graph does not include the # kernel launches for initial benchmarking (e.g., Triton autotune). - with with_cupy_nccl_for_all_reduce(): + with _maybe_cupy_nccl(): self.model( input_ids, positions, @@ -779,7 +780,7 @@ def capture( # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement self.graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self.graph, pool=memory_pool): # noqa: SIM117 - with with_cupy_nccl_for_all_reduce(): + with _maybe_cupy_nccl(): hidden_states = self.model( input_ids, positions, @@ -830,6 +831,15 @@ def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) +@contextlib.contextmanager +def _maybe_cupy_nccl(): + if cupy_utils.is_initialized() and not custom_all_reduce.is_initialized(): + with with_cupy_nccl_for_all_reduce(): + yield + else: + yield + + def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]: assert len(x) <= max_len return x + [pad] * (max_len - len(x)) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index c460e2e01fc85..29e4b16fe2594 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,6 +19,7 @@ from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner from vllm.lora.request import LoRARequest +from vllm.utils import is_hip class Worker: @@ -268,7 +269,8 @@ def init_distributed_environment( "cupy.distributed is already initialized but the cupy world " "size does not match parallel_config.world_size " f"({cupy_world_size} vs. 
{parallel_config.world_size}).") - elif parallel_config.world_size > 1 and cupy_port is not None: + elif (parallel_config.world_size > 1 and cupy_port is not None + and not is_hip()): # NOTE(woosuk): We don't initialize CuPy process group when world size # is 1. # TODO(woosuk): Support multi-node connection. From 31348dff03d638eb66abda9bec94b8992de9c7a1 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 14 Feb 2024 16:00:43 -0800 Subject: [PATCH 067/112] Align LoRA code between Mistral and Mixtral (fixes #2875) (#2880) * Fix AttributeError: MixtralModel object has no attribute org_vocab_size. * Make LoRA logic for Mistral and Mixtral the same --------- Co-authored-by: Pernekhan Utemuratov --- vllm/model_executor/models/mixtral.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 6cb1d84965ecf..0100624a44d78 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -285,15 +285,19 @@ def __init__( self, config: MixtralConfig, linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, + self.vocab_size, config.hidden_size, - org_num_embeddings=self.org_vocab_size, + org_num_embeddings=config.vocab_size, ) self.layers = nn.ModuleList([ MixtralDecoderLayer(config, linear_method=linear_method) @@ -350,7 +354,9 @@ def __init__( super().__init__() self.config = config self.linear_method = linear_method - self.model = MixtralModel(config, linear_method) + self.model = MixtralModel(config, + linear_method, + lora_config=lora_config) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size From d7afab6d3af84c18ecb9cbc478842e3bf62af906 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 14 Feb 2024 22:17:44 -0800 Subject: [PATCH 068/112] [BugFix] Fix GC bug for `LLM` class (#2882) --- tests/test_regression.py | 18 +++ vllm/lora/punica.py | 334 +++++++++++++++++++-------------------- 2 files changed, 182 insertions(+), 170 deletions(-) diff --git a/tests/test_regression.py b/tests/test_regression.py index c48e474bd889f..cb68e9ecfc06b 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -4,6 +4,10 @@ will never happen again. """ +import gc + +import torch + from vllm import LLM, SamplingParams @@ -35,6 +39,20 @@ def test_max_tokens_none(): assert len(prompts) == len(outputs) +def test_gc(): + llm = LLM("facebook/opt-125m", enforce_eager=True) + del llm + + gc.collect() + torch.cuda.empty_cache() + + # The memory allocated for model and KV cache should be released. + # The memory allocated for PyTorch and others should be less than 50MB. + # Usually, it's around 10MB. 
+ allocated = torch.cuda.memory_allocated() + assert allocated < 50 * 1024 * 1024 + + if __name__ == "__main__": import pytest pytest.main([__file__]) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index bcb73ccc19b0e..307a33dcf2820 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -4,173 +4,167 @@ import torch -import_exc = None - -try: - import vllm._punica_C as punica_kernels -except ImportError as e: - import_exc = e - -if import_exc is None: - - def bgmv( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - ): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight - matrices. - indicies: Shape: `[B]`. Indices of the weight matrices. - layer_idx: Layer index of the weight matrices. - scale: Scaling factor. - """ - punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) - - def add_lora(y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - *, - buffer: Optional[torch.Tensor] = None): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed - LoRA A matrices. - wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed - LoRA B matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - buffer: Optional. Shape: `[B, R]`. Temporary buffer. - """ - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical innacuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, - 1.0) - punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, - scale) - - def add_lora_slice(y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, - *, - buffer: Optional[torch.Tensor] = None): - """ - Same as `add_lora` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. - - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed - LoRA A matrices. - wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed - LoRA B matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - y_offset: Offset to apply to the starting column of y. - y_slice_size: Size of the y column slice. 
- """ - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - punica_kernels.dispatch_bgmv_low_level( - buffer, - x, - wa_t_all, - indicies, - layer_idx, - 1.0, - x.size(1), - buffer.size(1), - 0, - ) - punica_kernels.dispatch_bgmv_low_level( - y, - buffer, - wb_t_all, - indicies, - layer_idx, - scale, - buffer.size(1), - y_slice_size, - y_offset, - ) - -else: - - def _raise_exc( - *args, # pylint: disable=unused-argument - **kwargs # pylint: disable=unused-argument - ): - if torch.cuda.get_device_capability() < (8, 0): - raise ImportError("punica LoRA kernels require compute " - "capability>=8.0") from import_exc - else: - raise ImportError( - "punica LoRA kernels could not be imported. If you built vLLM " - "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " - "was set.") from import_exc - - bgmv = _raise_exc - add_lora = _raise_exc - add_lora_slice = _raise_exc - -__all__ = [ - "bgmv", - "add_lora", - "add_lora_slice", -] + +def _raise_import_error(e): + if torch.cuda.get_device_capability() < (8, 0): + raise ImportError( + "punica LoRA kernels require compute capability >= 8.0") from e + else: + raise ImportError( + "punica LoRA kernels could not be imported. If you built vLLM " + "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " + "was set.") from e + + +def bgmv( + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, +): + """ + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight + matrices. + indicies: Shape: `[B]`. Indices of the weight matrices. + layer_idx: Layer index of the weight matrices. + scale: Scaling factor. + """ + try: + import vllm._punica_C as punica_kernels + except ImportError as e: + _raise_import_error(e) + + punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) + + +def add_lora(y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + *, + buffer: Optional[torch.Tensor] = None): + """ + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed + LoRA A matrices. + wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed + LoRA B matrices. + indicies: Shape: `[B]`. Indices of the LoRA weights. + layer_idx: Layer index of LoRA weights. + scale: Scaling factor. + buffer: Optional. Shape: `[B, R]`. Temporary buffer. + """ + try: + import vllm._punica_C as punica_kernels + except ImportError as e: + _raise_import_error(e) + + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default to avoid + # numerical innacuracies that would otherwise happen + # due to downcasting. 
+ buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) + punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, + scale) + + +def add_lora_slice(y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, + *, + buffer: Optional[torch.Tensor] = None): + """ + Same as `add_lora` but you can operate on slices of y. + Pass whole y, define y_offset and y_slice_size. + + Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed + LoRA A matrices. + wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed + LoRA B matrices. + indicies: Shape: `[B]`. Indices of the LoRA weights. + layer_idx: Layer index of LoRA weights. + scale: Scaling factor. + y_offset: Offset to apply to the starting column of y. + y_slice_size: Size of the y column slice. + """ + try: + import vllm._punica_C as punica_kernels + except ImportError as e: + _raise_import_error(e) + + r = wb_t_all.size(-1) + if buffer is None: + # We set the buffer to be float32 by default to avoid + # numerical inaccuracies that would otherwise happen + # due to downcasting. + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + punica_kernels.dispatch_bgmv_low_level( + buffer, + x, + wa_t_all, + indicies, + layer_idx, + 1.0, + x.size(1), + buffer.size(1), + 0, + ) + punica_kernels.dispatch_bgmv_low_level( + y, + buffer, + wb_t_all, + indicies, + layer_idx, + scale, + buffer.size(1), + y_slice_size, + y_offset, + ) From 4f2ad1113553211778640c648e11f5aa2e03dbd4 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 14 Feb 2024 22:29:57 -0800 Subject: [PATCH 069/112] Fix DeciLM (#2883) --- vllm/model_executor/models/decilm.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index 984be0cccd16d..07aa4b72bf7a0 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -28,6 +28,7 @@ import torch from transformers import PretrainedConfig +from vllm.config import LoRAConfig from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.weight_utils import (default_weight_loader, @@ -56,10 +57,13 @@ def __init__( self, config: Optional[PretrainedConfig] = None, linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, ) -> None: config.num_key_value_heads = max(config.num_key_value_heads_per_layer) delattr(config, "num_key_value_heads_per_layer") - super().__init__(config=config, linear_method=linear_method) + super().__init__(config=config, + linear_method=linear_method, + lora_config=lora_config) def load_weights(self, model_name_or_path: str, From 5255d99dc595f9ae7647842242d6542aa4145a4f Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Thu, 15 Feb 2024 13:22:39 -0500 Subject: [PATCH 070/112] [ROCm] Dockerfile fix for flash-attention build (#2885) --- Dockerfile.rocm | 6 
+++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index e0ef4a0f4131a..54ae06be6e101 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -56,10 +56,10 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: # Install ROCm flash-attention -RUN if [ "$BUILD_FA" == "1" ]; then \ +RUN if [ "$BUILD_FA" = "1" ]; then \ mkdir libs \ && cd libs \ - && git clone https://github.com/ROCmSoftwarePlatform/flash-attention.git \ + && git clone https://github.com/ROCm/flash-attention.git \ && cd flash-attention \ && git checkout ${FA_BRANCH} \ && git submodule update --init \ @@ -83,7 +83,7 @@ RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" RUN cd /app \ && cd vllm \ && pip install -U -r requirements-rocm.txt \ - && if [ "$BUILD_FA" == "1" ]; then \ + && if [ "$BUILD_FA" = "1" ]; then \ bash patch_xformers.rocm.sh; fi \ && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \ && python3 setup.py install \ From 64da65b3225b7a2e6c2b161b726dc9751f973f33 Mon Sep 17 00:00:00 2001 From: "shiyi.c_98" Date: Fri, 16 Feb 2024 14:17:55 -0800 Subject: [PATCH 071/112] Prefix Caching- fix t4 triton error (#2517) --- vllm/model_executor/layers/triton_kernel/prefix_prefill.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py index 8fa70054f02ca..ba40d42307fab 100644 --- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py +++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py @@ -618,7 +618,9 @@ def context_attention_fwd(q, b_ctx_len, max_input_len, alibi_slopes=None): - BLOCK = 128 + + cap = torch.cuda.get_device_capability() + BLOCK = 128 if cap[0] >= 8 else 64 # shape constraints Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] assert Lq == Lk and Lk == Lv From 5f08050d8d0bfcdaced0fe706cdfc9e311e0f263 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 16 Feb 2024 15:05:18 -0800 Subject: [PATCH 072/112] Bump up to v0.3.1 (#2887) --- vllm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/__init__.py b/vllm/__init__.py index 36d177f5942e7..e3234c009c1dc 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -8,7 +8,7 @@ from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import SamplingParams -__version__ = "0.3.0" +__version__ = "0.3.1" __all__ = [ "LLM", From 185b2c29e241c864c9660ef0b69b5076a370f55e Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 17 Feb 2024 11:18:04 -0800 Subject: [PATCH 073/112] Defensively copy `sampling_params` (#2881) If the SamplingParams object passed to LLMEngine.add_request() is mutated after it returns, it could affect the async sampling process for that request. 
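A minimal sketch of the calling pattern this guards against, with a placeholder model and prompts (not taken from the patch): the caller reuses one `SamplingParams` object and mutates it after submitting a request, which, without the defensive copy added below, could be observed by the request already in flight.

```python
from vllm import EngineArgs, LLMEngine, SamplingParams

# Small engine purely for illustration; any supported model would do.
engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))

params = SamplingParams(temperature=0.8, max_tokens=16)
engine.add_request("req-a", "Hello, my name is", params)

# Mutating the shared object after add_request() returns; the deep copy
# added in this patch keeps "req-a" sampling with temperature=0.8.
params.temperature = 0.0
engine.add_request("req-b", "The capital of France is", params)

while engine.has_unfinished_requests():
    for out in engine.step():
        if out.finished:
            print(out.request_id, out.outputs[0].text)
```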
Suggested by @Yard1 https://github.com/vllm-project/vllm/pull/2514#discussion_r1490106059 --- vllm/engine/llm_engine.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 86f0925209309..29fa952665397 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -464,6 +464,9 @@ def add_request( prompt_token_ids[:prefix_pos], lora_request.lora_int_id if lora_request else 0) if prefix_pos is not None else None + # Defensive copy of SamplingParams, which are used by the sampler + sampling_params = copy.deepcopy(sampling_params) + # Create the sequence group. seq_group = SequenceGroup(request_id, [seq], sampling_params, arrival_time, lora_request, prefix) From 8f36444c4f9a55669bcb64e20b5588c0dd72bd93 Mon Sep 17 00:00:00 2001 From: jvmncs Date: Sat, 17 Feb 2024 15:00:48 -0500 Subject: [PATCH 074/112] multi-LoRA as extra models in OpenAI server (#2775) how to serve the loras (mimicking the [multilora inference example](https://github.com/vllm-project/vllm/blob/main/examples/multilora_inference.py)): ```terminal $ export LORA_PATH=~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/ $ python -m vllm.entrypoints.api_server \ --model meta-llama/Llama-2-7b-hf \ --enable-lora \ --lora-modules sql-lora=$LORA_PATH sql-lora2=$LORA_PATH ``` the above server will list 3 separate values if the user queries `/models`: one for the base served model, and one each for the specified lora modules. in this case sql-lora and sql-lora2 point to the same underlying lora, but this need not be the case. lora config values take the same values they do in EngineArgs no work has been done here to scope client permissions to specific models --- docs/source/models/lora.rst | 41 ++++++++- examples/multilora_inference.py | 4 +- tests/entrypoints/test_openai_server.py | 89 +++++++++++++++---- vllm/entrypoints/openai/api_server.py | 24 ++++- vllm/entrypoints/openai/serving_chat.py | 13 ++- vllm/entrypoints/openai/serving_completion.py | 15 +++- vllm/entrypoints/openai/serving_engine.py | 41 ++++++++- 7 files changed, 200 insertions(+), 27 deletions(-) diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index b773edfc6ff2b..1910f26506611 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -49,4 +49,43 @@ the third parameter is the path to the LoRA adapter. Check out `examples/multilora_inference.py `_ -for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. \ No newline at end of file +for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. + +Serving LoRA Adapters +--------------------- +LoRA adapted models can also be served with the Open-AI compatible vLLM server. To do so, we use +``--lora-modules {name}={path} {name}={path}`` to specify each LoRA module when we kickoff the server: + +.. code-block:: bash + + python -m vllm.entrypoints.api_server \ + --model meta-llama/Llama-2-7b-hf \ + --enable-lora \ + --lora-modules sql-lora=~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/ + +The server entrypoint accepts all other LoRA configuration parameters (``max_loras``, ``max_lora_rank``, ``max_cpu_loras``, +etc.), which will apply to all forthcoming requests. Upon querying the ``/models`` endpoint, we should see our LoRA along +with its base model: + +.. code-block:: bash + + curl localhost:8000/v1/models | jq . 
+ { + "object": "list", + "data": [ + { + "id": "meta-llama/Llama-2-7b-hf", + "object": "model", + ... + }, + { + "id": "sql-lora", + "object": "model", + ... + } + ] + } + +Requests can specify the LoRA adapter as if it were any other model via the ``model`` request parameter. The requests will be +processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other +LoRA adapter requests if they were provided and ``max_loras`` is set high enough). diff --git a/examples/multilora_inference.py b/examples/multilora_inference.py index 8fdd243af69ff..cd4451481ca83 100644 --- a/examples/multilora_inference.py +++ b/examples/multilora_inference.py @@ -12,7 +12,9 @@ from vllm.lora.request import LoRARequest -def create_test_prompts(lora_path: str) -> List[Tuple[str, SamplingParams]]: +def create_test_prompts( + lora_path: str +) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: """Create a list of test prompts with their sampling parameters. 2 requests for base model, 4 requests for the LoRA. We define 2 diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 54522f0a99fa1..3a359502c39d5 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -7,9 +7,11 @@ import requests import ray # using Ray for overall ease of process management, parallel requests, and debugging. import openai # use the official client for correctness check +from huggingface_hub import snapshot_download # downloading lora to test lora requests MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here +LORA_NAME = "typeof/zephyr-7b-beta-lora" # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here pytestmark = pytest.mark.asyncio @@ -54,7 +56,12 @@ def __del__(self): @pytest.fixture(scope="session") -def server(): +def zephyr_lora_files(): + return snapshot_download(repo_id=LORA_NAME) + + +@pytest.fixture(scope="session") +def server(zephyr_lora_files): ray.init() server_runner = ServerRunner.remote([ "--model", @@ -64,6 +71,17 @@ def server(): "--max-model-len", "8192", "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "128" ]) ray.get(server_runner.ready.remote()) yield server_runner @@ -79,8 +97,25 @@ def client(): yield client -async def test_single_completion(server, client: openai.AsyncOpenAI): - completion = await client.completions.create(model=MODEL_NAME, +async def test_check_models(server, client: openai.AsyncOpenAI): + models = await client.models.list() + models = models.data + served_model = models[0] + lora_models = models[1:] + assert served_model.id == MODEL_NAME + assert all(model.root == MODEL_NAME for model in models) + assert lora_models[0].id == "zephyr-lora" + assert lora_models[1].id == "zephyr-lora2" + + +@pytest.mark.parametrize( + # first test base model, then test loras + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +) +async def test_single_completion(server, client: openai.AsyncOpenAI, + model_name: str): + completion = await client.completions.create(model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=0.0) @@ -104,7 +139,13 @@ async def test_single_completion(server, client: 
openai.AsyncOpenAI): completion.choices[0].text) >= 5 -async def test_single_chat_session(server, client: openai.AsyncOpenAI): +@pytest.mark.parametrize( + # just test 1 lora hereafter + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_single_chat_session(server, client: openai.AsyncOpenAI, + model_name: str): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -115,7 +156,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI): # test single completion chat_completion = await client.chat.completions.create( - model=MODEL_NAME, + model=model_name, messages=messages, max_tokens=10, ) @@ -139,11 +180,17 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI): assert message.content is not None and len(message.content) >= 0 -async def test_completion_streaming(server, client: openai.AsyncOpenAI): +@pytest.mark.parametrize( + # just test 1 lora hereafter + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_completion_streaming(server, client: openai.AsyncOpenAI, + model_name: str): prompt = "What is an LLM?" single_completion = await client.completions.create( - model=MODEL_NAME, + model=model_name, prompt=prompt, max_tokens=5, temperature=0.0, @@ -152,7 +199,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI): single_usage = single_completion.usage stream = await client.completions.create( - model=MODEL_NAME, + model=model_name, prompt=prompt, max_tokens=5, temperature=0.0, @@ -166,7 +213,13 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI): assert "".join(chunks) == single_output -async def test_chat_streaming(server, client: openai.AsyncOpenAI): +@pytest.mark.parametrize( + # just test 1 lora hereafter + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_chat_streaming(server, client: openai.AsyncOpenAI, + model_name: str): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -177,7 +230,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI): # test single completion chat_completion = await client.chat.completions.create( - model=MODEL_NAME, + model=model_name, messages=messages, max_tokens=10, temperature=0.0, @@ -187,7 +240,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI): # test streaming stream = await client.chat.completions.create( - model=MODEL_NAME, + model=model_name, messages=messages, max_tokens=10, temperature=0.0, @@ -204,10 +257,16 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI): assert "".join(chunks) == output -async def test_batch_completions(server, client: openai.AsyncOpenAI): +@pytest.mark.parametrize( + # just test 1 lora hereafter + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_batch_completions(server, client: openai.AsyncOpenAI, + model_name: str): # test simple list batch = await client.completions.create( - model=MODEL_NAME, + model=model_name, prompt=["Hello, my name is", "Hello, my name is"], max_tokens=5, temperature=0.0, @@ -217,7 +276,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI): # test n = 2 batch = await client.completions.create( - model=MODEL_NAME, + model=model_name, prompt=["Hello, my name is", "Hello, my name is"], n=2, max_tokens=5, @@ -236,7 +295,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI): # test streaming batch = await client.completions.create( - model=MODEL_NAME, + model=model_name, prompt=["Hello, my name is", "Hello, my name is"], 
max_tokens=5, temperature=0.0, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index deb0fddd643cc..a217605452e3a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -23,6 +23,7 @@ from vllm.logger import init_logger from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion +from vllm.entrypoints.openai.serving_engine import LoRA TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -48,6 +49,16 @@ async def _force_log(): app = fastapi.FastAPI(lifespan=lifespan) +class LoRAParserAction(argparse.Action): + + def __call__(self, parser, namespace, values, option_string=None): + lora_list = [] + for item in values: + name, path = item.split('=') + lora_list.append(LoRA(name, path)) + setattr(namespace, self.dest, lora_list) + + def parse_args(): parser = argparse.ArgumentParser( description="vLLM OpenAI-Compatible RESTful API server.") @@ -81,6 +92,15 @@ def parse_args(): help="The model name used in the API. If not " "specified, the model name will be the same as " "the huggingface name.") + parser.add_argument( + "--lora-modules", + type=str, + default=None, + nargs='+', + action=LoRAParserAction, + help= + "LoRA module configurations in the format name=path. Multiple modules can be specified." + ) parser.add_argument("--chat-template", type=str, default=None, @@ -217,8 +237,10 @@ async def authentication(request: Request, call_next): engine = AsyncLLMEngine.from_engine_args(engine_args) openai_serving_chat = OpenAIServingChat(engine, served_model, args.response_role, + args.lora_modules, args.chat_template) - openai_serving_completion = OpenAIServingCompletion(engine, served_model) + openai_serving_completion = OpenAIServingCompletion( + engine, served_model, args.lora_modules) # Register labels for metrics add_global_metrics_labels(model_name=engine_args.model) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index a9e4c355560b8..850797ae4b9b6 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1,7 +1,7 @@ import time import codecs from fastapi import Request -from typing import AsyncGenerator, AsyncIterator, Union +from typing import AsyncGenerator, AsyncIterator, Optional, List, Union from vllm.logger import init_logger from vllm.utils import random_uuid from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -11,7 +11,7 @@ ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse, UsageInfo) from vllm.outputs import RequestOutput -from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA logger = init_logger(__name__) @@ -22,8 +22,11 @@ def __init__(self, engine: AsyncLLMEngine, served_model: str, response_role: str, + lora_modules: Optional[List[LoRA]] = None, chat_template=None): - super().__init__(engine=engine, served_model=served_model) + super().__init__(engine=engine, + served_model=served_model, + lora_modules=lora_modules) self.response_role = response_role self._load_chat_template(chat_template) @@ -64,11 +67,13 @@ async def create_chat_completion( token_ids = self._validate_prompt_and_tokenize(request, prompt=prompt) sampling_params = request.to_sampling_params() + lora_request = self._maybe_get_lora(request) except ValueError as e: return self.create_error_response(str(e)) result_generator = self.engine.generate(prompt, 
sampling_params, - request_id, token_ids) + request_id, token_ids, + lora_request) # Streaming response if request.stream: return self.chat_completion_stream_generator( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 191142d222ea7..667b659f81e9e 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -15,7 +15,7 @@ UsageInfo, ) from vllm.outputs import RequestOutput -from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA logger = init_logger(__name__) @@ -249,8 +249,13 @@ async def consumer(): class OpenAIServingCompletion(OpenAIServing): - def __init__(self, engine: AsyncLLMEngine, served_model: str): - super().__init__(engine=engine, served_model=served_model) + def __init__(self, + engine: AsyncLLMEngine, + served_model: str, + lora_modules: Optional[List[LoRA]] = None): + super().__init__(engine=engine, + served_model=served_model, + lora_modules=lora_modules) async def create_completion(self, request: CompletionRequest, raw_request: Request): @@ -284,6 +289,7 @@ async def create_completion(self, request: CompletionRequest, generators = [] try: sampling_params = request.to_sampling_params() + lora_request = self._maybe_get_lora(request) prompt_is_tokens, prompts = parse_prompt_format(request.prompt) for i, prompt in enumerate(prompts): @@ -298,7 +304,8 @@ async def create_completion(self, request: CompletionRequest, self.engine.generate(None, sampling_params, f"{request_id}-{i}", - prompt_token_ids=input_ids)) + prompt_token_ids=input_ids, + lora_request=lora_request)) except ValueError as e: return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 390f9aeb89217..09945471e9af0 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,4 +1,5 @@ import asyncio +from dataclasses import dataclass from http import HTTPStatus from typing import Dict, List, Optional, Union from vllm.logger import init_logger @@ -9,15 +10,35 @@ ErrorResponse, LogProbs, ModelCard, ModelList, ModelPermission) +from vllm.lora.request import LoRARequest logger = init_logger(__name__) +@dataclass +class LoRA: + name: str + local_path: str + + class OpenAIServing: - def __init__(self, engine: AsyncLLMEngine, served_model: str): + def __init__(self, + engine: AsyncLLMEngine, + served_model: str, + lora_modules=Optional[List[LoRA]]): self.engine = engine self.served_model = served_model + if lora_modules is None: + self.lora_requests = [] + else: + self.lora_requests = [ + LoRARequest( + lora_name=lora.name, + lora_int_id=i, + lora_local_path=lora.local_path, + ) for i, lora in enumerate(lora_modules, start=1) + ] self.max_model_len = 0 self.tokenizer = None @@ -50,6 +71,13 @@ async def show_available_models(self) -> ModelList: root=self.served_model, permission=[ModelPermission()]) ] + lora_cards = [ + ModelCard(id=lora.lora_name, + root=self.served_model, + permission=[ModelPermission()]) + for lora in self.lora_requests + ] + model_cards.extend(lora_cards) return ModelList(data=model_cards) def _create_logprobs( @@ -99,11 +127,22 @@ def create_error_response( async def _check_model(self, request) -> Optional[ErrorResponse]: if request.model == self.served_model: return + if request.model in [lora.lora_name for lora in self.lora_requests]: + return return 
self.create_error_response( message=f"The model `{request.model}` does not exist.", err_type="NotFoundError", status_code=HTTPStatus.NOT_FOUND) + def _maybe_get_lora(self, request) -> Optional[LoRARequest]: + if request.model == self.served_model: + return + for lora in self.lora_requests: + if request.model == lora.lora_name: + return lora + # if _check_model has been called earlier, this will be unreachable + raise ValueError("The model `{request.model}` does not exist.") + def _validate_prompt_and_tokenize( self, request: Union[ChatCompletionRequest, CompletionRequest], From 786b7f18a541a0460a9ee56154558ac7121601ac Mon Sep 17 00:00:00 2001 From: Mark Mozolewski <57800471+mbm-ai@users.noreply.github.com> Date: Sat, 17 Feb 2024 22:36:53 -0800 Subject: [PATCH 075/112] Add code-revision config argument for Hugging Face Hub (#2892) --- vllm/config.py | 8 +++++++- vllm/engine/arg_utils.py | 21 ++++++++++++++------- vllm/transformers_utils/config.py | 12 +++++++++--- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 27c61d4d50439..0b8a2a27f6d43 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -44,6 +44,9 @@ class ModelConfig: revision: The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. + code_revision: The specific revision to use for the model code on + Hugging Face Hub. It can be a branch name, a tag name, or a + commit id. If unspecified, will use the default version. tokenizer_revision: The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. @@ -70,6 +73,7 @@ def __init__( dtype: Union[str, torch.dtype], seed: int, revision: Optional[str] = None, + code_revision: Optional[str] = None, tokenizer_revision: Optional[str] = None, max_model_len: Optional[int] = None, quantization: Optional[str] = None, @@ -84,6 +88,7 @@ def __init__( self.load_format = load_format self.seed = seed self.revision = revision + self.code_revision = code_revision self.tokenizer_revision = tokenizer_revision self.quantization = quantization self.enforce_eager = enforce_eager @@ -103,7 +108,8 @@ def __init__( self.download_dir = model_path self.tokenizer = model_path - self.hf_config = get_config(self.model, trust_remote_code, revision) + self.hf_config = get_config(self.model, trust_remote_code, revision, + code_revision) self.dtype = _get_and_verify_dtype(self.hf_config, dtype) self.max_model_len = _get_and_verify_max_len(self.hf_config, max_model_len) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d5e63e25d6e85..8ac0157151d8e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -32,6 +32,7 @@ class EngineArgs: max_paddings: int = 256 disable_log_stats: bool = False revision: Optional[str] = None + code_revision: Optional[str] = None tokenizer_revision: Optional[str] = None quantization: Optional[str] = None enforce_eager: bool = False @@ -75,6 +76,13 @@ def add_cli_args( help='the specific model version to use. It can be a branch ' 'name, a tag name, or a commit id. If unspecified, will use ' 'the default version.') + parser.add_argument( + '--code-revision', + type=str, + default=None, + help='the specific revision to use for the model code on ' + 'Hugging Face Hub. It can be a branch name, a tag name, or a ' + 'commit id. 
If unspecified, will use the default version.') parser.add_argument( '--tokenizer-revision', type=str, @@ -279,13 +287,12 @@ def create_engine_configs( ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig, DeviceConfig, Optional[LoRAConfig]]: device_config = DeviceConfig(self.device) - model_config = ModelConfig(self.model, self.tokenizer, - self.tokenizer_mode, self.trust_remote_code, - self.download_dir, self.load_format, - self.dtype, self.seed, self.revision, - self.tokenizer_revision, self.max_model_len, - self.quantization, self.enforce_eager, - self.max_context_len_to_capture) + model_config = ModelConfig( + self.model, self.tokenizer, self.tokenizer_mode, + self.trust_remote_code, self.download_dir, self.load_format, + self.dtype, self.seed, self.revision, self.code_revision, + self.tokenizer_revision, self.max_model_len, self.quantization, + self.enforce_eager, self.max_context_len_to_capture) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index b12918e41b32e..491cb4d9a427c 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -16,10 +16,14 @@ def get_config(model: str, trust_remote_code: bool, - revision: Optional[str] = None) -> PretrainedConfig: + revision: Optional[str] = None, + code_revision: Optional[str] = None) -> PretrainedConfig: try: config = AutoConfig.from_pretrained( - model, trust_remote_code=trust_remote_code, revision=revision) + model, + trust_remote_code=trust_remote_code, + revision=revision, + code_revision=code_revision) except ValueError as e: if (not trust_remote_code and "requires you to execute the configuration file" in str(e)): @@ -33,5 +37,7 @@ def get_config(model: str, raise e if config.model_type in _CONFIG_REGISTRY: config_class = _CONFIG_REGISTRY[config.model_type] - config = config_class.from_pretrained(model, revision=revision) + config = config_class.from_pretrained(model, + revision=revision, + code_revision=code_revision) return config From 537c9755a736b4e206107a99e1c8961448a3d63b Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Sun, 18 Feb 2024 14:39:00 -0800 Subject: [PATCH 076/112] [Minor] Small fix to make distributed init logic in worker looks cleaner (#2905) --- vllm/worker/worker.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 29e4b16fe2594..9df518d155ec2 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -93,8 +93,6 @@ def init_model(self, cupy_port: Optional[int] = None) -> None: # Initialize the distributed environment. init_distributed_environment(self.parallel_config, self.rank, cupy_port, self.distributed_init_method) - if not self.parallel_config.disable_custom_all_reduce: - init_custom_ar() # Initialize the model. set_random_seed(self.model_config.seed) @@ -288,6 +286,10 @@ def init_distributed_environment( ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) + # Initialize a custom fast all-reduce implementation. + if not parallel_config.disable_custom_all_reduce: + init_custom_ar() + def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): # Check if the GPU supports the dtype. 
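Before the next patch, a short usage sketch for the `code_revision` option added in #2892 above. It is illustrative only: the repository id is a placeholder, and it assumes the `LLM` entrypoint forwards extra keyword arguments through to `EngineArgs`, as it does for its other engine options.

```python
from vllm import LLM

# Pin the weights/config and the remote modeling code to revisions that are
# chosen independently, which matters for trust_remote_code repos whose
# Python files change separately from their checkpoints.
llm = LLM(
    model="my-org/remote-code-model",  # hypothetical Hub repo id
    trust_remote_code=True,
    revision="v1.0",        # branch, tag, or commit for weights/config
    code_revision="v1.0",   # branch, tag, or commit for the model code
)
```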
From a61f0521b8d0d53a91951bb56789ead397d5cd83 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Sun, 18 Feb 2024 16:44:50 -0800 Subject: [PATCH 077/112] [Test] Add basic correctness test (#2908) --- .buildkite/test-pipeline.yaml | 12 +++++- .../test_basic_correctness.py | 38 +++++++++++++++++ tests/conftest.py | 2 + .../test_basic_distributed_correctness.py | 41 +++++++++++++++++++ 4 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 tests/basic_correctness/test_basic_correctness.py create mode 100644 tests/distributed/test_basic_distributed_correctness.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2e417ef940322..a91dcdfaf2ea5 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -11,8 +11,16 @@ steps: - label: AsyncEngine Test command: pytest -v -s async_engine -- label: Distributed Test - command: pytest -v -s test_comm_ops.py +- label: Basic Correctness Test + command: pytest -v -s --forked basic_correctness + +- label: Distributed Comm Ops Test + command: pytest -v -s --forked test_comm_ops.py + working_dir: "/vllm-workspace/tests/distributed" + num_gpus: 2 # only support 1 or 2 for now. + +- label: Distributed Correctness Test + command: pytest -v -s --forked test_basic_distributed_correctness.py working_dir: "/vllm-workspace/tests/distributed" num_gpus: 2 # only support 1 or 2 for now. diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py new file mode 100644 index 0000000000000..fe67e0f2f4808 --- /dev/null +++ b/tests/basic_correctness/test_basic_correctness.py @@ -0,0 +1,38 @@ +"""Compare the short outputs of HF and vLLM when using greedy sampling. + +Run `pytest tests/basic_correctness/test_basic_correctness.py --forked`. 
+""" +import pytest + +MODELS = [ + "facebook/opt-125m", + "meta-llama/Llama-2-7b-hf", +] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [5]) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + hf_model = hf_runner(model, dtype=dtype) + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + del hf_model + + vllm_model = vllm_runner(model, dtype=dtype) + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + del vllm_model + + for i in range(len(example_prompts)): + hf_output_ids, hf_output_str = hf_outputs[i] + vllm_output_ids, vllm_output_str = vllm_outputs[i] + assert hf_output_str == vllm_output_str, ( + f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") + assert hf_output_ids == vllm_output_ids, ( + f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") diff --git a/tests/conftest.py b/tests/conftest.py index 8d6afdbd00358..941d48ec28441 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -165,6 +165,7 @@ def __init__( model_name: str, tokenizer_name: Optional[str] = None, dtype: str = "half", + tensor_parallel_size: int = 1, ) -> None: self.model = LLM( model=model_name, @@ -172,6 +173,7 @@ def __init__( trust_remote_code=True, dtype=dtype, swap_space=0, + tensor_parallel_size=tensor_parallel_size, ) def generate( diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py new file mode 100644 index 0000000000000..82075356fccbd --- /dev/null +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -0,0 +1,41 @@ +"""Compare the outputs of HF and distributed vLLM when using greedy sampling. + +Run `pytest tests/distributed/test_basic_distributed_correctness.py --forked`. +""" +import pytest +import torch + +MODELS = [ + "facebook/opt-125m", + "meta-llama/Llama-2-7b-hf", +] + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [5]) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + hf_model = hf_runner(model, dtype=dtype) + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + del hf_model + + vllm_model = vllm_runner(model, dtype=dtype, tensor_parallel_size=2) + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + del vllm_model + + for i in range(len(example_prompts)): + hf_output_ids, hf_output_str = hf_outputs[i] + vllm_output_ids, vllm_output_str = vllm_outputs[i] + assert hf_output_str == vllm_output_str, ( + f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") + assert hf_output_ids == vllm_output_ids, ( + f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") From ab3a5a8259922ce312d01be39d29e27666968039 Mon Sep 17 00:00:00 2001 From: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Date: Mon, 19 Feb 2024 13:05:15 +0800 Subject: [PATCH 078/112] Support OLMo models. 
(#2832) --- README.md | 1 + docs/source/models/supported_models.rst | 3 + tests/models/test_models.py | 19 +- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/olmo.py | 378 ++++++++++++++++++++ vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/olmo.py | 72 ++++ 7 files changed, 471 insertions(+), 5 deletions(-) create mode 100644 vllm/model_executor/models/olmo.py create mode 100644 vllm/transformers_utils/configs/olmo.py diff --git a/README.md b/README.md index c48ddcfa0a79a..e0954f6cb329f 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.) - Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.) - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.) +- OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.) - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.) - Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.) - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 5d7f401cc6e2c..8bc747770e098 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -62,6 +62,9 @@ Alongside each architecture, we include some popular models that use it. * - :code:`MPTForCausalLM` - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. + * - :code:`OLMoForCausalLM` + - OLMo + - :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc. * - :code:`OPTForCausalLM` - OPT, OPT-IML - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. 
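As a quick complement to the docs changes above, a smoke-test sketch for the newly listed OLMo checkpoints. The prompt is arbitrary, and `trust_remote_code=True` is assumed to be needed because these repos ship their configuration code on the Hub.

```python
from vllm import LLM, SamplingParams

# Load the smallest OLMo checkpoint mentioned above and generate greedily.
llm = LLM(model="allenai/OLMo-1B", trust_remote_code=True)
outputs = llm.generate(["Language models are"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)
```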
diff --git a/tests/models/test_models.py b/tests/models/test_models.py index 40858a517b311..e44452e9893cf 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -5,11 +5,20 @@ import pytest MODELS = [ - "facebook/opt-125m", "meta-llama/Llama-2-7b-hf", - "mistralai/Mistral-7B-v0.1", "Deci/DeciLM-7b", "tiiuae/falcon-7b", "gpt2", - "bigcode/tiny_starcoder_py", "EleutherAI/gpt-j-6b", - "EleutherAI/pythia-70m", "bigscience/bloom-560m", "mosaicml/mpt-7b", - "microsoft/phi-2", "stabilityai/stablelm-3b-4e1t" + "facebook/opt-125m", + "meta-llama/Llama-2-7b-hf", + "mistralai/Mistral-7B-v0.1", + "Deci/DeciLM-7b", + "tiiuae/falcon-7b", + "gpt2", + "bigcode/tiny_starcoder_py", + "EleutherAI/gpt-j-6b", + "EleutherAI/pythia-70m", + "bigscience/bloom-560m", + "mosaicml/mpt-7b", + "microsoft/phi-2", + "stabilityai/stablelm-3b-4e1t", + "allenai/OLMo-1B", ] diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 5cba1cf0414db..0f6a4bd9a4ad6 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -35,6 +35,7 @@ # transformers's mpt class has lower case "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), + "OLMoForCausalLM": ("olmo", "OLMoForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "PhiForCausalLM": ("phi", "PhiForCausalLM"), "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py new file mode 100644 index 0000000000000..2eb42935e8bfd --- /dev/null +++ b/vllm/model_executor/models/olmo.py @@ -0,0 +1,378 @@ +# coding=utf-8 +# Adapted from +# https://github.com/allenai/OLMo/blob/v0.2.4/olmo/model.py and +# https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/modeling_olmo.py +# Copyright 2023 The vLLM team. +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. +# +# BSD 3-Clause License +# +# Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+"""Inference-only OLMo model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import nn + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + LinearMethodBase, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size, ) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import ( + default_weight_loader, + hf_model_weights_iterator, +) +from vllm.sequence import SamplerOutput +from vllm.transformers_utils.configs.olmo import OLMoConfig + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class SwiGLU(nn.Module): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, gate = x.chunk(2, dim=-1) + return F.silu(gate) * x + + @property + def output_multiplier(self) -> float: + return 0.5 + + +class OlmoAttention(nn.Module): + """ + This is the attention block where the output is computed as ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + (plus another skip connection). + """ + + def __init__( + self, + config: OLMoConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.hidden_size = config.d_model + assert config.d_model % config.n_heads == 0 + tensor_model_parallel_world_size = get_tensor_model_parallel_world_size( + ) + self.total_num_heads = self.config.n_heads + assert self.total_num_heads % tensor_model_parallel_world_size == 0 + self.num_heads = self.total_num_heads // tensor_model_parallel_world_size + self.head_dim = self.hidden_size // self.total_num_heads + + # Layer norms. + self.attn_norm = nn.LayerNorm(config.d_model, + elementwise_affine=False, + bias=False) + # Attention input projection. Projects x -> (q, k, v) + self.att_proj = QKVParallelLinear( + config.d_model, + self.head_dim, + self.total_num_heads, + bias=config.include_bias, + linear_method=linear_method, + ) + + # Rotary embeddings. + if self.config.rope: + rope_theta = getattr(config, "rope_theta", 10000) + max_position_embeddings = getattr(config, + "max_position_embeddings", 8192) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + ) + self.scaling = self.head_dim**-0.5 + self.attn = PagedAttention(self.num_heads, + self.head_dim, + scale=self.scaling) + + # Attention output projection. 
+ self.attn_out = RowParallelLinear( + config.d_model, + config.d_model, + bias=config.include_bias, + linear_method=linear_method, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.attn_norm(hidden_states) + qkv, _ = self.att_proj(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + if self.config.rope: + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.attn_out(attn_output) + return output + + +class OlmoMLP(nn.Module): + """ + This is the MLP block where the output is computed as ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + (plus another skip connection). + """ + + def __init__( + self, + config: OLMoConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.hidden_size = (config.mlp_hidden_size if config.mlp_hidden_size + is not None else config.mlp_ratio * config.d_model) + + # Layer norms. + self.ff_norm = nn.LayerNorm(config.d_model, + elementwise_affine=False, + bias=False) + + # Feed-forward input projection. + self.ff_proj = ColumnParallelLinear( + config.d_model, + self.hidden_size, + bias=config.include_bias, + linear_method=linear_method, + ) + + # Activation function. + # self.act = SiluAndMul() + # self.act.output_multiplier = 0.5 + self.act = SwiGLU() + assert (self.act.output_multiplier * self.hidden_size) % 1 == 0 + + # Feed-forward output projection. + self.ff_out = RowParallelLinear( + int(self.act.output_multiplier * self.hidden_size), + config.d_model, + bias=config.include_bias, + linear_method=linear_method, + ) + + def forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + # Add feed-forward projection. + # shape: (batch_size, seq_len, d_model) + og_x = x + x = self.ff_norm(x) + x, _ = self.ff_proj(x) + x = self.act(x) + x, _ = self.ff_out(x) + x = og_x + x + + return x + + +class OlmoBlock(nn.Module): + """ + This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` + (plus another skip connection). + """ + + def __init__(self, + config: OLMoConfig, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + # Attention block. + self.attn = OlmoAttention(config, linear_method) + + # MLP block. + self.mlp = OlmoMLP(config, linear_method) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + # Attention block. + og_x = hidden_states + x = self.attn(positions, hidden_states, kv_cache, input_metadata) + x = x + og_x + + # MLP block. 
+ hidden_states = self.mlp(x) + return hidden_states + + +class OlmoModel(nn.Module): + + def __init__(self, + config: OLMoConfig, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + + self.transformer = nn.ModuleDict( + dict( + wte=VocabParallelEmbedding( + config.embedding_size or config.vocab_size, + config.d_model, + ), + ln_f=nn.LayerNorm(config.d_model, + elementwise_affine=False, + bias=False), + )) + + blocks = [ + OlmoBlock(config, linear_method) for i in range(config.n_layers) + ] + if self.config.block_group_size > 1: + raise NotImplementedError("Block group size > 1 not supported yet") + else: + self.transformer.update({"blocks": nn.ModuleList(blocks)}) + + if not config.weight_tying: + self.transformer.update({ + "ff_out": + ColumnParallelLinear( + config.d_model, + config.embedding_size or config.vocab_size, + bias=config.include_bias, + linear_method=linear_method, + ) + }) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + """ + :param input_ids: A tensor of shape `(batch_size, seq_len)`. + """ + # Get embeddings of input. + # shape: (batch_size, seq_len, d_model) + x = self.transformer.wte(input_ids) # type: ignore + + # Apply blocks one-by-one. + for block_idx, block in enumerate(self.transformer.blocks): + # shape: (batch_size, seq_len, d_model) + x = block( + positions, + x, + kv_caches[block_idx], + input_metadata, + ) + + # Apply final layer norm. + # shape: (batch_size, seq_len or 1, d_model) + x = self.transformer.ln_f(x) # type: ignore + return x + + +class OLMoForCausalLM(nn.Module): + """ + Extremely barebones HF model wrapper. + """ + + def __init__(self, + config: OLMoConfig, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = OlmoModel(config, linear_method) + self.lm_head_weight = (self.model.transformer.wte.weight + if config.weight_tying else + self.model.transformer.ff_out.weight) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model( + input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + input_metadata=input_metadata, + ) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head_weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights( + self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None, + ): + params_dict = dict(self.named_parameters(remove_duplicate=False)) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + # attention + if ".att" in name: + name = name.replace(".att", ".attn.att") + # mlp + if ".ff" in name and "transformer.ff_out" not in name: + name = name.replace(".ff", ".mlp.ff") + # there is no bias in olmo + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index bbba741ca536a..47bcc2b9594be 100644 --- 
a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,6 +1,7 @@ from vllm.transformers_utils.configs.baichuan import BaiChuanConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.mpt import MPTConfig +from vllm.transformers_utils.configs.olmo import OLMoConfig from vllm.transformers_utils.configs.qwen import QWenConfig # RWConfig is for the original tiiuae/falcon-40b(-instruct) and # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the @@ -11,6 +12,7 @@ "BaiChuanConfig", "ChatGLMConfig", "MPTConfig", + "OLMoConfig", "QWenConfig", "RWConfig", ] diff --git a/vllm/transformers_utils/configs/olmo.py b/vllm/transformers_utils/configs/olmo.py new file mode 100644 index 0000000000000..a9dfc6ec88ca6 --- /dev/null +++ b/vllm/transformers_utils/configs/olmo.py @@ -0,0 +1,72 @@ +# coding=utf-8 +# adapted from https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/configuration_olmo.py +"""OLMo configuration""" +from transformers import PretrainedConfig + + +class OLMoConfig(PretrainedConfig): + model_type = 'olmo' + attribute_map = { + 'num_attention_heads': 'n_heads', + 'hidden_size': 'd_model', + 'num_hidden_layers': 'n_layers', + } + + # Note that the defaults for these attributes are equivalent to the base GPT2 model. + def __init__( + self, + d_model=768, + n_heads=12, + n_layers=12, + mlp_ratio=4, + mlp_hidden_size=None, + activation_type="swiglu", + block_type="sequential", + block_group_size=1, + alibi=False, + alibi_bias_max=8.0, + rope=False, + rope_full_precision=True, + multi_query_attention=False, + attention_layer_norm=False, + layer_norm_type="default", + layer_norm_with_affine=True, + attention_layer_norm_with_affine=True, + max_sequence_length=1024, + include_bias=True, + bias_for_layer_norm=None, + scale_logits=False, + vocab_size=50257, + embedding_size=50304, + weight_tying=True, + eos_token_id=50256, + pad_token_id=50256, + **kwargs, + ): + self.d_model = d_model + self.n_heads = n_heads + self.n_layers = n_layers + self.mlp_ratio = mlp_ratio + self.mlp_hidden_size = mlp_hidden_size + self.activation_type = activation_type + self.block_type = block_type + self.block_group_size = block_group_size + self.alibi = alibi + self.alibi_bias_max = alibi_bias_max + self.rope = rope + self.rope_full_precision = rope_full_precision + self.multi_query_attention = multi_query_attention + self.attention_layer_norm = attention_layer_norm + self.layer_norm_type = layer_norm_type + self.layer_norm_with_affine = layer_norm_with_affine + self.attention_layer_norm_with_affine = attention_layer_norm_with_affine + self.max_sequence_length = max_sequence_length + self.include_bias = include_bias + self.bias_for_layer_norm = bias_for_layer_norm + self.scale_logits = scale_logits + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.weight_tying = weight_tying + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + super().__init__(**kwargs) From 86fd8bb0ac9a836e55b5075d8416bd067af9e7b2 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sun, 18 Feb 2024 21:36:19 -0800 Subject: [PATCH 079/112] Add warning to prevent changes to benchmark api server (#2858) --- vllm/entrypoints/api_server.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index f7b8d258fae4c..e7af2c6db5e4c 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -1,3 +1,9 @@ +""" +NOTE: This API 
server is used only for demonstrating usage of AsyncEngine and simple performance benchmarks. +It is not intended for production use. For production use, we recommend using our OpenAI compatible server. +We are also not going to accept PRs modifying this file, please change `vllm/entrypoints/openai/api_server.py` instead. +""" + import argparse import json from typing import AsyncGenerator From e433c115bce2bf27f7b1abdde7029566007d9eee Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Mon, 19 Feb 2024 09:55:41 +0200 Subject: [PATCH 080/112] Fix `vllm:prompt_tokens_total` metric calculation (#2869) --- tests/conftest.py | 10 +++++----- tests/metrics/test_metrics.py | 33 +++++++++++++++++++++++++++++++++ vllm/engine/llm_engine.py | 4 +++- 3 files changed, 41 insertions(+), 6 deletions(-) create mode 100644 tests/metrics/test_metrics.py diff --git a/tests/conftest.py b/tests/conftest.py index 941d48ec28441..6af9b36b6febe 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,12 +13,10 @@ _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] -def _read_prompts(filename: str) -> str: - prompts = [] +def _read_prompts(filename: str) -> List[str]: with open(filename, "r") as f: - prompt = f.readline() - prompts.append(prompt) - return prompts + prompts = f.readlines() + return prompts @pytest.fixture @@ -165,6 +163,7 @@ def __init__( model_name: str, tokenizer_name: Optional[str] = None, dtype: str = "half", + disable_log_stats: bool = True, tensor_parallel_size: int = 1, ) -> None: self.model = LLM( @@ -173,6 +172,7 @@ def __init__( trust_remote_code=True, dtype=dtype, swap_space=0, + disable_log_stats=disable_log_stats, tensor_parallel_size=tensor_parallel_size, ) diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py new file mode 100644 index 0000000000000..da608a6a18f92 --- /dev/null +++ b/tests/metrics/test_metrics.py @@ -0,0 +1,33 @@ +import pytest +import vllm.engine.metrics + +MODELS = [ + "facebook/opt-125m", +] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [128]) +def test_metrics( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + vllm_model = vllm_runner(model, dtype=dtype, disable_log_stats=False) + tokenizer = vllm_model.model.get_tokenizer() + prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] + # This test needs at least 2 prompts in a batch of different lengths to verify their token count is correct despite padding. + assert len(example_prompts) > 1, "at least 2 prompts are required" + assert prompt_token_counts[0] != prompt_token_counts[1], ( + "prompts of different lengths are required") + vllm_prompt_token_count = sum(prompt_token_counts) + + _ = vllm_model.generate_greedy(example_prompts, max_tokens) + metric_count = vllm.engine.metrics.counter_prompt_tokens.get_value({}) + + assert vllm_prompt_token_count == metric_count, ( + f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}" + ) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 29fa952665397..2fa04f72d7951 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -867,7 +867,9 @@ def _get_stats(self, # Number of Tokens. 
if prompt_run: - num_prompt_tokens = scheduler_outputs.num_batched_tokens + num_prompt_tokens = sum( + len(seq_group.prompt_token_ids) + for seq_group in scheduler_outputs.scheduled_seq_groups) else: num_generation_tokens = scheduler_outputs.num_batched_tokens From 264017a2bf030f060ebad91eb9be9b4e0033edb9 Mon Sep 17 00:00:00 2001 From: James Whedbee Date: Mon, 19 Feb 2024 19:58:59 -0600 Subject: [PATCH 081/112] [ROCm] include gfx908 as supported (#2792) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ea58a1a49e7e3..8fcb86394f76d 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ # Supported NVIDIA GPU architectures. NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx942", "gfx1100"} +ROCM_SUPPORTED_ARCHS = {"gfx908", "gfx90a", "gfx942", "gfx1100"} # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) From 63e2a6419dc5863311a11d1d2a95cda9fc8ef7e5 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Tue, 20 Feb 2024 14:37:39 -0800 Subject: [PATCH 082/112] [FIX] Fix beam search test (#2930) --- tests/samplers/test_beam_search.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index a491ffa763505..9398aeb2c214c 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -26,6 +26,7 @@ def test_beam_search_single_input( max_tokens: int, beam_width: int, ) -> None: + example_prompts = example_prompts[:1] hf_model = hf_runner(model, dtype=dtype) hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, max_tokens) From 181b27d8813e6a92de4f38cecfa24914e652588a Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 20 Feb 2024 14:38:55 -0800 Subject: [PATCH 083/112] Make vLLM logging formatting optional (#2877) --- vllm/logger.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/logger.py b/vllm/logger.py index 530494ae66925..d25fcef9ba2ee 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -5,6 +5,8 @@ import sys import os +VLLM_CONFIGURE_LOGGING = int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")) + _FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" _DATE_FORMAT = "%m-%d %H:%M:%S" @@ -45,13 +47,15 @@ def _setup_logger(): # The logger is initialized when the module is imported. # This is thread-safe as the module is only imported once, # guaranteed by the Python GIL. 
-_setup_logger() +if VLLM_CONFIGURE_LOGGING: + _setup_logger() def init_logger(name: str): # Use the same settings as above for root logger logger = logging.getLogger(name) logger.setLevel(os.getenv("LOG_LEVEL", "DEBUG")) - logger.addHandler(_default_handler) - logger.propagate = False + if VLLM_CONFIGURE_LOGGING: + logger.addHandler(_default_handler) + logger.propagate = False return logger From 017d9f15151ce571a5f4fd381699c72a872636ec Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 20 Feb 2024 21:55:57 -0800 Subject: [PATCH 084/112] Add metrics to RequestOutput (#2876) --- tests/async_engine/test_request_tracker.py | 2 +- vllm/core/policy.py | 2 +- vllm/core/scheduler.py | 3 ++ vllm/engine/llm_engine.py | 7 +++- vllm/outputs.py | 10 ++++- vllm/sequence.py | 46 ++++++++++++++++++++-- 6 files changed, 61 insertions(+), 9 deletions(-) diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index 3e4d53c5cbe23..4043558bae919 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -64,7 +64,7 @@ def test_request_tracker(): stream_5 = tracker.add_request("5") assert tracker.new_requests_event.flag tracker.process_request_output( - RequestOutput("2", "output", [], [], [], finished=True)) + RequestOutput("2", "output", [], [], [], bool(finished))) new, finished = tracker.get_new_and_finished_requests() assert not tracker.new_requests_event.flag assert len(finished) == 1 diff --git a/vllm/core/policy.py b/vllm/core/policy.py index 99f183b42c8b4..2e9ebbda54412 100644 --- a/vllm/core/policy.py +++ b/vllm/core/policy.py @@ -33,7 +33,7 @@ def get_priority( now: float, seq_group: SequenceGroup, ) -> float: - return now - seq_group.arrival_time + return now - seq_group.metrics.arrival_time class PolicyFactory: diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 4fdf9ec341cfd..5dde9097a3d57 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -365,10 +365,13 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # This function call changes the internal states of the scheduler # such as self.running, self.swapped, and self.waiting. scheduler_outputs = self._schedule() + now = time.time() # Create input data structures. seq_group_metadata_list: List[SequenceGroupMetadata] = [] for seq_group in scheduler_outputs.scheduled_seq_groups: + seq_group.maybe_set_first_scheduled_time(now) + seq_data: Dict[int, SequenceData] = {} block_tables: Dict[int, List[int]] = {} for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 2fa04f72d7951..f0de40f54db61 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -728,6 +728,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, def _process_model_outputs( self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: + now = time.time() # Update the scheduled sequence groups with the model outputs. scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups for seq_group, outputs in zip(scheduled_seq_groups, output): @@ -739,6 +740,7 @@ def _process_model_outputs( # Create the outputs. 
request_outputs: List[RequestOutput] = [] for seq_group in scheduled_seq_groups: + seq_group.maybe_set_first_token_time(now) request_output = RequestOutput.from_seq_group(seq_group) request_outputs.append(request_output) for seq_group in scheduler_outputs.ignored_seq_groups: @@ -876,11 +878,12 @@ def _get_stats(self, # Latency Timings. time_last_iters = [] for seq_group in scheduler_outputs.scheduled_seq_groups: - # Time since last token. (n.b. updates seq_group.last_token_time) + # Time since last token. (n.b. updates seq_group.metrics.last_token_time) time_last_iters.append(seq_group.get_last_latency(now)) # Time since arrival for all finished requests. if seq_group.is_finished(): - time_e2e_requests.append(now - seq_group.arrival_time) + time_e2e_requests.append(now - + seq_group.metrics.arrival_time) time_to_first_tokens = time_last_iters if prompt_run else [] time_per_output_tokens = [] if prompt_run else time_last_iters diff --git a/vllm/outputs.py b/vllm/outputs.py index 534e9d5ea8a53..a6de2a5a2257b 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,7 +1,8 @@ from typing import List, Optional +import time from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup, - SequenceStatus) + SequenceStatus, RequestMetrics) from vllm.lora.request import LoRARequest @@ -60,6 +61,7 @@ class RequestOutput: prompt_logprobs: The log probabilities to return per prompt token. outputs: The output sequences of the request. finished: Whether the whole request is finished. + metrics: Metrics associated with the request. lora_request: The LoRA request that was used to generate the output. """ @@ -71,6 +73,7 @@ def __init__( prompt_logprobs: Optional[PromptLogprobs], outputs: List[CompletionOutput], finished: bool, + metrics: Optional[RequestMetrics] = None, lora_request: Optional[LoRARequest] = None, ) -> None: self.request_id = request_id @@ -79,6 +82,7 @@ def __init__( self.prompt_logprobs = prompt_logprobs self.outputs = outputs self.finished = finished + self.metrics = metrics self.lora_request = lora_request @classmethod @@ -115,12 +119,15 @@ def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": prompt_token_ids = seq_group.prompt_token_ids prompt_logprobs = seq_group.prompt_logprobs finished = seq_group.is_finished() + finished_time = time.time() if finished else None + seq_group.set_finished_time(finished_time) return cls(seq_group.request_id, prompt, prompt_token_ids, prompt_logprobs, outputs, finished, + seq_group.metrics, lora_request=seq_group.lora_request) def __repr__(self) -> str: @@ -130,4 +137,5 @@ def __repr__(self) -> str: f"prompt_logprobs={self.prompt_logprobs}, " f"outputs={self.outputs}, " f"finished={self.finished}, " + f"metrics={self.metrics}, " f"lora_request={self.lora_request})") diff --git a/vllm/sequence.py b/vllm/sequence.py index 9669562cfeac5..44adb058a5ba5 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1,6 +1,7 @@ """Sequence and its related classes.""" import copy import enum +from dataclasses import dataclass from typing import Dict, List, Optional, Union from vllm.block import LogicalTokenBlock @@ -49,6 +50,25 @@ def get_finished_reason(status: "SequenceStatus") -> Union[str, None]: return finish_reason +@dataclass +class RequestMetrics: + """Metrics associated with a request. + + Args: + arrival_time: The time when the request arrived. + first_scheduled_time: The time when the request was first scheduled. + first_token_time: The time when the first token was generated. 
+ time_in_queue: The time the request spent in the queue. + finished_time: The time when the request was finished. + """ + arrival_time: float + last_token_time: float + first_scheduled_time: Optional[float] + first_token_time: Optional[float] + time_in_queue: Optional[float] + finished_time: Optional[float] = None + + class SequenceData: """Data associated with a sequence. @@ -252,8 +272,11 @@ def __init__( self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} self.sampling_params = sampling_params - self.arrival_time = arrival_time - self.last_token_time = arrival_time + self.metrics = RequestMetrics(arrival_time=arrival_time, + last_token_time=arrival_time, + first_scheduled_time=None, + first_token_time=None, + time_in_queue=None) self.lora_request = lora_request self.prefix: Optional[Prefix] = prefix self.prompt_logprobs: Optional[PromptLogprobs] = None @@ -276,10 +299,25 @@ def lora_int_id(self) -> int: def get_last_latency(self, now: float) -> float: """Gets last token latency for Request level timings.""" - latency = now - self.last_token_time - self.last_token_time = now + latency = now - self.metrics.last_token_time + self.metrics.last_token_time = now return latency + def maybe_set_first_token_time(self, time: float) -> None: + """Sets the first token time for Request level timings.""" + if self.metrics.first_token_time is None: + self.metrics.first_token_time = time + + def maybe_set_first_scheduled_time(self, time: float) -> None: + """Sets the first scheduled time and time in queue for Request level timings.""" + if self.metrics.first_scheduled_time is None: + self.metrics.first_scheduled_time = time + self.metrics.time_in_queue = time - self.metrics.arrival_time + + def set_finished_time(self, time: Optional[float]) -> None: + """Sets the finished time for Request level timings.""" + self.metrics.finished_time = time + def get_max_num_running_seqs(self) -> int: """The maximum number of sequences running in parallel in the remaining lifetime of the request.""" From 5253edaacb3d023fad83d0549d525dd404ff1a26 Mon Sep 17 00:00:00 2001 From: Xiang Xu <117880274+xiangxu-google@users.noreply.github.com> Date: Wed, 21 Feb 2024 09:34:30 -0800 Subject: [PATCH 085/112] Add Gemma model (#2964) --- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/gemma.py | 333 +++++++++++++++++++++++++ 2 files changed, 334 insertions(+) create mode 100644 vllm/model_executor/models/gemma.py diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 0f6a4bd9a4ad6..17d8d69ba8672 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -20,6 +20,7 @@ "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), + "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py new file mode 100644 index 0000000000000..affe54c448a2c --- /dev/null +++ b/vllm/model_executor/models/gemma.py @@ -0,0 +1,333 @@ +# coding=utf-8 +# Copyright 2023 The vLLM team. +# Copyright (c) Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Gemma model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import GemmaConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearMethodBase, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class GemmaRMSNorm(nn.Module): + + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.zeros(dim)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * (1 + self.weight) + + +class GemmaMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.gate_proj = ColumnParallelLinear(hidden_size, + intermediate_size, + bias=False, + linear_method=linear_method) + self.up_proj = ColumnParallelLinear(hidden_size, + intermediate_size, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + self.act_fn = nn.GELU() + + def forward(self, x): + gate, _ = self.gate_proj(x) + gate = self.act_fn(gate) + up, _ = self.up_proj(x) + fuse = gate * up + outputs, _ = self.down_proj(fuse) + return outputs + + +class GemmaAttention(nn.Module): + + def __init__(self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + head_dim: int, + max_position_embeddings: int = 8192, + rope_theta: float = 10000, + linear_method: Optional[LinearMethodBase] = None) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=self.rope_theta, + is_neox_style=True, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class GemmaDecoderLayer(nn.Module): + + def __init__( + self, + config: GemmaConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = GemmaAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + head_dim=config.head_dim, + max_position_embeddings=config.max_position_embeddings, + rope_theta=config.rope_theta, + linear_method=linear_method, + ) + self.mlp = GemmaMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + linear_method=linear_method, + ) + self.input_layernorm = GemmaRMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = GemmaRMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class GemmaModel(nn.Module): + + def __init__( + self, + config: GemmaConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + GemmaDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = 
self.embed_tokens(input_ids) + # Normalize the embedding by sqrt(hidden_size) + hidden_states = hidden_states * (self.config.hidden_size**0.5) + + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + ) + hidden_states = self.norm(hidden_states) + return hidden_states + + +class GemmaForCausalLM(nn.Module): + + def __init__( + self, + config: GemmaConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = GemmaModel(config, linear_method) + self.sampler = Sampler(config.vocab_size) + + @torch.no_grad() + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.model.embed_tokens.weight, + hidden_states, sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + loaded_params = set() + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + for (param_name, shard_name, shard_id) in stacked_params_mapping: + if shard_name not in name: + continue + name = name.replace(shard_name, param_name) + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra layer for lora models. + if "lm_head" in name: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + unloaded_params = params_dict.keys() - loaded_params + if unloaded_params: + raise RuntimeError( + f"Some weights are not initialized from checkpoints: {unloaded_params}" + ) From c20ecb6a51cb58d408eb5ae7b03ac76c7b83e609 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 21 Feb 2024 09:38:03 -0800 Subject: [PATCH 086/112] Upgrade transformers to v4.38.0 (#2965) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4fcfe661185b5..de08bd29beaf9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy torch == 2.1.2 -transformers >= 4.37.0 # Required for Qwen2 +transformers >= 4.38.0 # Required for Gemma. xformers == 0.0.23.post1 # Required for CUDA 12.1. 
fastapi uvicorn[standard] From a9c821289582747c57f149017678a282f5e788e4 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Wed, 21 Feb 2024 09:46:15 -0800 Subject: [PATCH 087/112] [FIX] Add Gemma model to the doc (#2966) --- README.md | 1 + docs/source/models/supported_models.rst | 3 +++ 2 files changed, 4 insertions(+) diff --git a/README.md b/README.md index e0954f6cb329f..7a16bb1fef044 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi - ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.) - DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.) - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.) +- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.) - GPT-2 (`gpt2`, `gpt2-xl`, etc.) - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.) - GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 8bc747770e098..c1639ca9e056a 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -32,6 +32,9 @@ Alongside each architecture, we include some popular models that use it. * - :code:`FalconForCausalLM` - Falcon - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. + * - :code:`GemmaForCausalLM` + - Gemma + - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. * - :code:`GPT2LMHeadModel` - GPT-2 - :code:`gpt2`, :code:`gpt2-xl`, etc. From dc903e70acf9dba74d6afaa50e7b5650d6b9338a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 21 Feb 2024 09:46:57 -0800 Subject: [PATCH 088/112] [ROCm] Upgrade transformers to v4.38.0 (#2967) --- requirements-rocm.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-rocm.txt b/requirements-rocm.txt index 7b42ee067310b..42b89ae84aa45 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -6,7 +6,7 @@ ray >= 2.9 sentencepiece # Required for LLaMA tokenizer. numpy tokenizers>=0.15.0 -transformers >= 4.37.0 # Required for Mixtral. +transformers >= 4.38.0 # Required for Gemma. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. 
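As a quick, hedged illustration of the Gemma support introduced in the preceding patches, a minimal offline-generation sketch using the existing `LLM`/`SamplingParams` API might look like the snippet below (with transformers >= 4.38.0 installed, per the requirements bump above). The `google/gemma-2b` checkpoint name is taken from the supported-models table; the prompt and sampling values are illustrative assumptions, not part of the patch series.

    from vllm import LLM, SamplingParams

    # Load a Gemma checkpoint; the newly registered GemmaForCausalLM
    # architecture is resolved from the model's Hugging Face config.
    llm = LLM(model="google/gemma-2b")

    # Greedy decoding (temperature=0) keeps the sketch deterministic.
    sampling_params = SamplingParams(temperature=0, max_tokens=32)

    outputs = llm.generate(["The capital of France is"], sampling_params)
    print(outputs[0].outputs[0].text)
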
From 7d2dcce175cec00bc1d127d6b3a5f1ef73a6ba3c Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 21 Feb 2024 11:47:00 -0800 Subject: [PATCH 089/112] Support per-request seed (#2514) --- tests/samplers/test_sampler.py | 222 +++++++++++++++-------- tests/samplers/test_seeded_generate.py | 82 +++++++++ vllm/core/scheduler.py | 1 + vllm/engine/arg_utils.py | 1 - vllm/entrypoints/openai/protocol.py | 4 + vllm/model_executor/layers/sampler.py | 29 ++- vllm/model_executor/sampling_metadata.py | 3 + vllm/sampling_params.py | 9 +- vllm/sequence.py | 12 ++ vllm/worker/model_runner.py | 10 + 10 files changed, 289 insertions(+), 84 deletions(-) create mode 100644 tests/samplers/test_seeded_generate.py diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index d34f32d03fee0..31e865f42ff3b 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,10 +1,11 @@ import random -from typing import Tuple +from typing import Tuple, List from unittest.mock import patch import pytest import torch from transformers import GenerationConfig, GenerationMixin +from typing import Optional from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.utils import set_random_seed @@ -46,15 +47,13 @@ def _prepare_test( ] -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_greedy(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, sampler, model_runner = _prepare_test( - batch_size) - +def _do_sample( + batch_size: int, + input_tensor: torch.Tensor, + sampler: MockLogitsSampler, + model_runner: ModelRunner, + sampling_params: SamplingParams, +): seq_group_metadata_list = [] prompt_lens = [] for i in range(batch_size): @@ -63,7 +62,7 @@ def test_sampler_all_greedy(seed: int, device: str): request_id=f"test_{i}", is_prompt=True, seq_data={0: SequenceData([1, 2, 3])}, - sampling_params=SamplingParams(temperature=0, ), + sampling_params=sampling_params, block_tables={0: [1]}, )) prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) @@ -71,9 +70,23 @@ def test_sampler_all_greedy(seed: int, device: str): sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list, prompt_lens, subquery_lens=prompt_lens) - sampler_output = sampler(embedding=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) + return sampler(embedding=None, + hidden_states=input_tensor, + sampling_metadata=sampling_metadata) + + +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_greedy(seed: int, device: str): + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + input_tensor, fake_logits, sampler, model_runner = _prepare_test( + batch_size) + + sampling_params = SamplingParams(temperature=0) + sampler_output = _do_sample(batch_size, input_tensor, sampler, + model_runner, sampling_params) expected = torch.argmax(fake_logits, dim=-1) for i, sequence_output in enumerate(sampler_output): for nth_output in sequence_output.samples: @@ -94,28 +107,40 @@ def test_sampler_all_random(seed: int, device: str): for i in range(batch_size): fake_logits[i, i] = 1e2 - seq_group_metadata_list = [] - prompt_lens = [] + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + ) + sampler_output = _do_sample(batch_size, input_tensor, sampler, + model_runner, 
sampling_params) + + for i, sequence_output in enumerate(sampler_output): + for nth_output in sequence_output.samples: + assert nth_output.output_token == i + + del model_runner + + +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_random_seed(seed: int, device: str): + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + input_tensor, fake_logits, sampler, model_runner = _prepare_test( + batch_size) + for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData([1, 2, 3])}, - sampling_params=SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - ), - block_tables={0: [1]}, - )) - prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + fake_logits[i, i] = 1e2 + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + seed=random.randint(0, 10000), + ) + sampler_output = _do_sample(batch_size, input_tensor, sampler, + model_runner, sampling_params) - sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list, - prompt_lens, - subquery_lens=prompt_lens) - sampler_output = sampler(embedding=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) for i, sequence_output in enumerate(sampler_output): for nth_output in sequence_output.samples: assert nth_output.output_token == i @@ -123,6 +148,31 @@ def test_sampler_all_random(seed: int, device: str): del model_runner +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_random_seed_deterministic(seed: int, device: str): + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + input_tensor, fake_logits, sampler, model_runner = _prepare_test( + batch_size) + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + seed=random.randint(0, 10000), + ) + first_sampler_output = _do_sample(batch_size, input_tensor, sampler, + model_runner, sampling_params) + + second_sampler_output = _do_sample(batch_size, input_tensor, sampler, + model_runner, sampling_params) + + assert first_sampler_output == second_sampler_output + + del model_runner + + @pytest.mark.parametrize("seed", RANDOM_SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_all_beam(seed: int, device: str): @@ -131,29 +181,13 @@ def test_sampler_all_beam(seed: int, device: str): batch_size = random.randint(1, 256) input_tensor, _, sampler, model_runner = _prepare_test(batch_size) - seq_group_metadata_list = [] - prompt_lens = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData([1, 2, 3])}, - sampling_params=SamplingParams( - temperature=0, - best_of=2, - use_beam_search=True, - ), - block_tables={0: [1]}, - )) - prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list, - prompt_lens, - subquery_lens=prompt_lens) - sampler(embedding=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) + sampling_params = SamplingParams( + temperature=0, + best_of=2, + use_beam_search=True, + ) + _do_sample(batch_size, input_tensor, sampler, model_runner, + sampling_params) # no assertion here as I am not sure how to determine whether # the outputs 
are expected - in other words, this just tests # whether there are no exceptions in the sampler @@ -171,14 +205,15 @@ def test_sampler_mixed(seed: int, device: str): batch_size) seq_group_metadata_list = [] - expected_tokens = [] + expected_tokens: List[Optional[List[int]]] = [] prompt_lens = [] for i in range(batch_size): - n = 1 - sampling_type = random.randint(0, 2) + expected: Optional[List[int]] = None + sampling_type = random.randint(0, 3) if sampling_type == 0: sampling_params = SamplingParams(temperature=0) - elif sampling_type == 1: + expected = [torch.argmax(fake_logits[i], dim=-1).item()] + elif sampling_type in (1, 2): n = random.randint(1, 10) sampling_params = SamplingParams( temperature=random.random() + 0.1, @@ -187,13 +222,17 @@ def test_sampler_mixed(seed: int, device: str): n=n, presence_penalty=random.randint(0, 1), ) + if sampling_type == 2: + sampling_params.seed = random.randint(0, 10000) + else: + for idx in range(n): + fake_logits[i, i + idx] = 1e2 + expected = list(range(i, i + n)) else: sampling_params = SamplingParams(temperature=0, use_beam_search=True, best_of=2) - for idx in range(n): - fake_logits[i, i + idx] = 1e2 - expected_tokens.append(i + idx) + expected_tokens.append(expected) seq_group_metadata_list.append( SequenceGroupMetadata( request_id=f"test_{i}", @@ -204,17 +243,50 @@ def test_sampler_mixed(seed: int, device: str): )) prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list, - prompt_lens, - subquery_lens=prompt_lens) - sampler_output = sampler(embedding=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) - for i, sequence_output in enumerate(sampler_output): - if seq_group_metadata_list[i].sampling_params.use_beam_search: - continue - for nth_output in sequence_output.samples: - assert nth_output.output_token in expected_tokens + def test_sampling(model_runner: ModelRunner): + sampling_metadata = model_runner._prepare_sample( + seq_group_metadata_list, prompt_lens, subquery_lens=prompt_lens) + sampler_output = sampler(embedding=None, + hidden_states=input_tensor, + sampling_metadata=sampling_metadata) + + for i, (sequence_output, metadata) in enumerate( + zip(sampler_output, seq_group_metadata_list)): + if metadata.sampling_params.use_beam_search: + continue + + if metadata.sampling_params.seed is not None \ + and expected_tokens[i] is None: + # Record seeded random result to compare with results of second invocation + expected_tokens[i] = [ + nth_output.output_token + for nth_output in sequence_output.samples + ] + continue + + for n, nth_output in enumerate(sequence_output.samples): + if metadata.sampling_params.temperature == 0 or metadata.sampling_params.seed is not None: + # Ensure exact matches for greedy or random with seed + assert nth_output.output_token == expected_tokens[i][n] + else: + # For non-seeded random check that one of the high-logit tokens were chosen + assert nth_output.output_token in expected_tokens[i] + + # Test batch + test_sampling(model_runner) + + # Shuffle the batch and resample + target_index = list(range(batch_size)) + for list_to_shuffle in (target_index, seq_group_metadata_list, + expected_tokens, prompt_lens): + random.Random(seed).shuffle(list_to_shuffle) + target_index = torch.tensor(target_index) + input_tensor.data = input_tensor.index_select(0, target_index) + fake_logits.data = fake_logits.index_select(0, target_index) + + # This time, results of seeded random samples will be compared with the 
corresponding + # sample in the pre-shuffled batch + test_sampling(model_runner) del model_runner diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py new file mode 100644 index 0000000000000..fcb0e09d46143 --- /dev/null +++ b/tests/samplers/test_seeded_generate.py @@ -0,0 +1,82 @@ +"""Verify that seeded random sampling is deterministic. + +Run `pytest tests/samplers/test_seeded_generate.py --forked`. +""" +import copy +import random +from itertools import combinations + +import pytest + +from vllm.model_executor.utils import set_random_seed +from vllm import SamplingParams + +MODEL = "facebook/opt-125m" +RANDOM_SEEDS = list(range(5)) + + +@pytest.fixture +def vllm_model(vllm_runner): + vllm_model = vllm_runner(MODEL, dtype="half") + yield vllm_model + del vllm_model + + +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +def test_random_sample_with_seed( + vllm_model, + example_prompts, + seed: int, +) -> None: + set_random_seed(seed) + + sampling_params = SamplingParams( + # Parameters to ensure sufficient randomness + temperature=2.0, + top_p=min(random.random() + 0.3, 1), + top_k=random.randint(5, 20), + n=random.randint(1, 10), + presence_penalty=random.randint(0, 1), + max_tokens=8, + ignore_eos=True, + ) + + sampling_params_seed_1 = copy.deepcopy(sampling_params) + sampling_params_seed_1.seed = 100 + sampling_params_seed_2 = copy.deepcopy(sampling_params) + sampling_params_seed_2.seed = 200 + + llm = vllm_model.model + + for prompt in example_prompts: + for params in ( + sampling_params, + sampling_params_seed_1, + sampling_params_seed_2, + sampling_params, + sampling_params_seed_1, + sampling_params_seed_2, + ): + llm._add_request( + prompt=prompt, + prompt_token_ids=None, + sampling_params=params, + ) + + results = llm._run_engine(use_tqdm=False) + all_outputs = [[out.token_ids for out in output.outputs] + for output in results] + + for i in range(0, len(example_prompts), 6): + outputs = all_outputs[i:i + 6] + + # verify all non-seeded requests differ + for output_a, output_b in combinations( + (outputs[0], outputs[1], outputs[2], outputs[3]), + 2, + ): + assert output_a != output_b + + # verify requests with the same seed match + assert outputs[1] == outputs[4] + assert outputs[2] == outputs[5] diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 5dde9097a3d57..f4ac2d6dc59fe 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -387,6 +387,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: block_tables=block_tables, lora_request=seq_group.lora_request, prefix=seq_group.prefix, + state=seq_group.state, ) seq_group_metadata_list.append(seq_group_metadata) return seq_group_metadata_list, scheduler_outputs diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8ac0157151d8e..a4efd171b871d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -173,7 +173,6 @@ def add_cli_args( default=EngineArgs.block_size, choices=[8, 16, 32], help='token block size') - # TODO(woosuk): Support fine-grained seeds (e.g., seed per request). 
parser.add_argument('--seed', type=int, default=EngineArgs.seed, diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index fc15b7833ecf2..727fec870293c 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -60,6 +60,7 @@ class ChatCompletionRequest(BaseModel): top_p: Optional[float] = 1.0 n: Optional[int] = 1 max_tokens: Optional[int] = None + seed: Optional[int] = None stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stream: Optional[bool] = False presence_penalty: Optional[float] = 0.0 @@ -90,6 +91,7 @@ def to_sampling_params(self) -> SamplingParams: temperature=self.temperature, top_p=self.top_p, min_p=self.min_p, + seed=self.seed, stop=self.stop, stop_token_ids=self.stop_token_ids, max_tokens=self.max_tokens, @@ -117,6 +119,7 @@ class CompletionRequest(BaseModel): logprobs: Optional[int] = None echo: Optional[bool] = False stop: Optional[Union[str, List[str]]] = Field(default_factory=list) + seed: Optional[int] = None presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 best_of: Optional[int] = None @@ -147,6 +150,7 @@ def to_sampling_params(self): top_p=self.top_p, top_k=self.top_k, min_p=self.min_p, + seed=self.seed, stop=self.stop, stop_token_ids=self.stop_token_ids, ignore_eos=self.ignore_eos, diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index bc86a916b5bbf..884d84387e505 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -342,7 +342,9 @@ def _beam_search_sample( def _multinomial( probs: torch.Tensor, num_samples: int, -): + seq_groups: Optional[List[Tuple[List[int], SamplingParams]]] = None, + generators: Optional[List[torch.Generator]] = None, +) -> torch.Tensor: if num_samples > 1: # This is equivalent to torch.repeat_interleaved (which also # forces a GPU<->CPU sync). @@ -352,7 +354,15 @@ def _multinomial( probs = probs[:, None, :].expand(probs.shape[0], num_samples, probs.shape[1]).contiguous().view( -1, probs.shape[1]) - q = torch.empty_like(probs).exponential_(1) + q = torch.empty_like(probs) + if seq_groups is None: + q.exponential_() + else: + sample_idx = 0 + for (seq_ids, _), generator in zip(seq_groups, generators): + next_sample_idx = sample_idx + len(seq_ids) * num_samples + q[sample_idx:next_sample_idx].exponential_(generator=generator) + sample_idx = next_sample_idx return probs.div_(q).argmax(dim=1).view(-1, num_samples) @@ -370,6 +380,7 @@ def _sample( sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} sample_metadata = {} + multinomial_samples = {} # Counterintiutively, having two loops here is actually faster. # The first loop can run without waiting on GPU<->CPU sync. 
@@ -385,14 +396,18 @@ def _sample( is_prompts, sample_indices) if sampling_type == SamplingType.GREEDY: greedy_samples = torch.argmax(logprobs[sample_indices], dim=-1) - elif sampling_type == SamplingType.RANDOM: + elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): max_best_of = 1 for seq_group, is_prompt in zip(seq_groups, is_prompts): if is_prompt: _, sampling_params = seq_group max_best_of = max(max_best_of, sampling_params.best_of) - multinomial_samples = _multinomial(probs[sample_indices], - max_best_of) + seeded_args = {} if sampling_type == SamplingType.RANDOM else { + "seq_groups": seq_groups, + "generators": sampling_metadata.generators, + } + multinomial_samples[sampling_type] = _multinomial( + probs[sample_indices], max_best_of, **seeded_args) elif sampling_type == SamplingType.BEAM: beam_search_logprobs = logprobs[sample_indices] else: @@ -407,9 +422,9 @@ def _sample( sampling_type] if sampling_type == SamplingType.GREEDY: sample_results = _greedy_sample(seq_groups, greedy_samples) - elif sampling_type == SamplingType.RANDOM: + elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): sample_results = _random_sample(seq_groups, is_prompts, - multinomial_samples) + multinomial_samples[sampling_type]) elif sampling_type == SamplingType.BEAM: sample_results = _beam_search_sample(seq_groups, is_prompts, sampling_metadata.seq_data, diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 2d41d40e04678..d0ffeecd2d74d 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -19,6 +19,7 @@ class SamplingMetadata: prompt_lens: Lengths of prompts. selected_token_indices: Token indices selected for sampling. categorized_sample_indices: SamplingType -> token indices to sample. + generators: List of torch.Generators to use for seeded sampling perform_sampling: Whether to perform sampling. This option is used to make the sampling only happens in the driver worker, and disable sampling in other worker processes. @@ -31,6 +32,7 @@ def __init__( prompt_lens: Optional[List[int]], selected_token_indices: torch.Tensor, categorized_sample_indices: Optional[Dict[SamplingType, torch.Tensor]], + generators: Optional[List[torch.Generator]] = None, perform_sampling: bool = True, ) -> None: self.seq_groups = seq_groups @@ -38,6 +40,7 @@ def __init__( self.prompt_lens = prompt_lens self.selected_token_indices = selected_token_indices self.categorized_sample_indices = categorized_sample_indices + self.generators = generators self.perform_sampling = perform_sampling self.num_prompts = len(prompt_lens) if prompt_lens is not None else 0 diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index bb7d0002c910c..51d39220ca9ca 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -11,7 +11,8 @@ class SamplingType(IntEnum): GREEDY = 0 RANDOM = 1 - BEAM = 2 + RANDOM_SEED = 2 + BEAM = 3 LogitsProcessor = Callable[[List[int], torch.Tensor], torch.Tensor] @@ -56,6 +57,7 @@ class SamplingParams: min_p: Float that represents the minimum probability for a token to be considered, relative to the probability of the most likely token. Must be in [0, 1]. Set to 0 to disable this. + seed: Random seed to use for the generation. use_beam_search: Whether to use beam search instead of sampling. length_penalty: Float that penalizes sequences based on their length. Used in beam search. 
@@ -101,6 +103,7 @@ def __init__( top_p: float = 1.0, top_k: int = -1, min_p: float = 0.0, + seed: Optional[int] = None, use_beam_search: bool = False, length_penalty: float = 1.0, early_stopping: Union[bool, str] = False, @@ -124,6 +127,7 @@ def __init__( self.top_p = top_p self.top_k = top_k self.min_p = min_p + self.seed = seed self.use_beam_search = use_beam_search self.length_penalty = length_penalty self.early_stopping = early_stopping @@ -229,6 +233,8 @@ def sampling_type(self) -> SamplingType: return SamplingType.BEAM if self.temperature < _SAMPLING_EPS: return SamplingType.GREEDY + if self.seed is not None: + return SamplingType.RANDOM_SEED return SamplingType.RANDOM def __repr__(self) -> str: @@ -242,6 +248,7 @@ def __repr__(self) -> str: f"top_p={self.top_p}, " f"top_k={self.top_k}, " f"min_p={self.min_p}, " + f"seed={self.seed}, " f"use_beam_search={self.use_beam_search}, " f"length_penalty={self.length_penalty}, " f"early_stopping={self.early_stopping}, " diff --git a/vllm/sequence.py b/vllm/sequence.py index 44adb058a5ba5..040e9756e15c6 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -248,6 +248,14 @@ def __repr__(self) -> str: f"num_blocks={len(self.logical_token_blocks)})") +@dataclass +class SequenceGroupState: + """Mutable state tied to a specific sequence group""" + + # torch.Generator used in seeded sampling + generator: Optional = None + + class SequenceGroup: """A group of sequences that are generated from the same prompt. @@ -280,6 +288,7 @@ def __init__( self.lora_request = lora_request self.prefix: Optional[Prefix] = prefix self.prompt_logprobs: Optional[PromptLogprobs] = None + self.state = SequenceGroupState() @property def prompt(self) -> str: @@ -397,6 +406,7 @@ class SequenceGroupMetadata: sampling_params: The sampling parameters used to generate the outputs. block_tables: The block tables. (Seq id -> list of physical block numbers) + state: Internal state tied to this sequence group. lora_request: LoRA request. prefix: The prefix of the prompt of the sequence group. 
""" @@ -410,6 +420,7 @@ def __init__( block_tables: Dict[int, List[int]], lora_request: Optional[LoRARequest] = None, prefix: Optional[Prefix] = None, + state: Optional[SequenceGroupState] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt @@ -418,6 +429,7 @@ def __init__( self.block_tables = block_tables self.lora_request = lora_request self.prefix = prefix + self.state = SequenceGroupState() if state is None else state @property def lora_int_id(self) -> int: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a27b7d9cb4dcf..b99a409e02d1e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -389,6 +389,7 @@ def _prepare_sample( ) -> SamplingMetadata: seq_groups: List[Tuple[List[int], SamplingParams]] = [] selected_token_indices: List[int] = [] + generators: List[torch.Generator] = [] selected_token_start_idx = 0 categorized_sample_indices = {t: [] for t in SamplingType} categorized_sample_indices_start_idx = 0 @@ -419,6 +420,10 @@ def _prepare_sample( selected_token_indices.append(selected_token_start_idx + subquery_len - 1) selected_token_start_idx += max_subquery_len + + if sampling_params.seed is not None: + seq_group_metadata.state.generator = torch.Generator( + device="cuda").manual_seed(sampling_params.seed) else: num_seqs = len(seq_ids) selected_token_indices.extend( @@ -432,6 +437,9 @@ def _prepare_sample( categorized_sample_indices_start_idx + num_seqs)) categorized_sample_indices_start_idx += num_seqs + if sampling_params.seed is not None: + generators.append(seq_group_metadata.state.generator) + selected_token_indices = _async_h2d(selected_token_indices, dtype=torch.long, target_device=self.device, @@ -454,6 +462,7 @@ def _prepare_sample( prompt_lens=prompt_lens, selected_token_indices=selected_token_indices, categorized_sample_indices=categorized_sample_indices, + generators=generators, ) return sampling_metadata @@ -536,6 +545,7 @@ def prepare_input_tensors( prompt_lens=None, selected_token_indices=metadata_dict["selected_token_indices"], categorized_sample_indices=None, + generators=None, perform_sampling=False, ) From 8fbd84bf7839d53e6dd26a1dd4473dd1a99aab6e Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Wed, 21 Feb 2024 11:47:25 -0800 Subject: [PATCH 090/112] Bump up version to v0.3.2 (#2968) This version is for more model support. Add support for Gemma models (#2964) and OLMo models (#2832). 
--- vllm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/__init__.py b/vllm/__init__.py index e3234c009c1dc..7ff92d8cc681d 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -8,7 +8,7 @@ from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import SamplingParams -__version__ = "0.3.1" +__version__ = "0.3.2" __all__ = [ "LLM", From 7c4304b7cc9160a0dead5655c3c30692df615160 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+rib-2@users.noreply.github.com> Date: Thu, 1 Feb 2024 15:27:20 -0800 Subject: [PATCH 091/112] Add sparsity support based with magic_wand GPU kernels --- README.md | 48 ++++---- examples/offline_bench.py | 111 ++++++++++++++++++ vllm/config.py | 27 +++++ vllm/engine/arg_utils.py | 15 ++- vllm/engine/llm_engine.py | 1 + vllm/entrypoints/llm.py | 7 ++ vllm/model_executor/layers/linear.py | 36 +++++- .../layers/parameters/__init__.py | 10 ++ .../layers/parameters/sparsity.py | 46 ++++++++ .../layers/sparsity/__init__.py | 20 ++++ .../layers/sparsity/base_config.py | 51 ++++++++ .../layers/sparsity/sparse_w16a16.py | 99 ++++++++++++++++ vllm/model_executor/model_loader.py | 23 +++- vllm/model_executor/weight_utils.py | 25 +++- 14 files changed, 489 insertions(+), 30 deletions(-) create mode 100644 examples/offline_bench.py create mode 100644 vllm/model_executor/layers/parameters/__init__.py create mode 100644 vllm/model_executor/layers/parameters/sparsity.py create mode 100644 vllm/model_executor/layers/sparsity/__init__.py create mode 100644 vllm/model_executor/layers/sparsity/base_config.py create mode 100644 vllm/model_executor/layers/sparsity/sparse_w16a16.py diff --git a/README.md b/README.md index 7a16bb1fef044..0ab4381a16700 100644 --- a/README.md +++ b/README.md @@ -82,29 +82,35 @@ vLLM seamlessly supports many Hugging Face models, including the following archi Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): ```bash -pip install vllm +git clone https://github.com/neuralmagic/magic_wand.git +cd magic_wand +export TORCH_CUDA_ARCH_LIST=8.6 +pip install -e . ``` -## Getting Started - -Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started. -- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html) -- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html) -- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) - -## Contributing +Install: +```bash +cd ../ +pip install -e . +``` -We welcome and value any contributions and collaborations. -Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved. +### Run Sample -## Citation +Run a 50% sparse model: -If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180): -```bibtex -@inproceedings{kwon2023efficient, - title={Efficient Memory Management for Large Language Model Serving with PagedAttention}, - author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. 
Gonzalez and Hao Zhang and Ion Stoica}, - booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles}, - year={2023} -} -``` +```bash +from vllm import LLM, SamplingParams + +model = LLM( + "nm-testing/Llama-2-7b-pruned50-retrained", + sparsity="sparse_w16a16", # If left off, model will be loaded as dense + enforce_eager=True, # Does not work with cudagraphs yet + dtype="float16", + tensor_parallel_size=1, + max_model_len=1024 +) + +sampling_params = SamplingParams(max_tokens=100, temperature=0) +outputs = model.generate("Hello my name is", sampling_params=sampling_params) +outputs[0].outputs[0].text +``` \ No newline at end of file diff --git a/examples/offline_bench.py b/examples/offline_bench.py new file mode 100644 index 0000000000000..ae7b391da0c39 --- /dev/null +++ b/examples/offline_bench.py @@ -0,0 +1,111 @@ +import random +import time +import argparse + +from vllm import LLM, SamplingParams + +NUM_REQUESTS_DEFAULT = 256 +MAX_SEQ_LEN_DEFAULT = 1024 +MAX_TOKENS_DEFAULT = 128 +SAMPLE_PROMPTS = [ + # "Hello, my name is", + # "The president of the United States is", + # "The capital of France is", + "The future of AI is", +] + + +def run_bench(model_name, + model_revision, + is_sparse, + quant_method, + max_seq_len, + max_tokens, + num_requests, + num_gpus, + num_warmup_iters=1, + num_bench_iters=5, + possible_prompts=SAMPLE_PROMPTS, + enforce_eager=True): + print("Run bench with:") + print(f" model_name = {model_name}") + print(f" model_revision = {model_revision}") + print(f" is_sparse = {is_sparse}") + print(f" quant_method = {quant_method}") + print(f" max_seq_len = {max_seq_len}") + print(f" max_tokens = {max_tokens}") + print(f" num_requests = {num_requests}") + print(f" num_gpus = {num_gpus}") + print(f" num_warmup_iters = {num_warmup_iters}") + print(f" num_bench_iters = {num_bench_iters}") + + prompts = [] + for _ in range(num_requests): + index = random.randint(0, len(possible_prompts) - 1) + prompts.append(possible_prompts[index]) + + # Create sampling params + sampling_params = SamplingParams(temperature=0.8, + top_p=0.95, + max_tokens=max_tokens) + + # Create LLM + llm = LLM( + model=model_name, + revision=model_revision, + sparsity="sparse_w16a16" if is_sparse else None, + enforce_eager=enforce_eager, + # dtype=torch.bfloat16, + tensor_parallel_size=num_gpus, + gpu_memory_utilization=0.9, + max_model_len=max_seq_len, + quantization=quant_method, + ) + + for i in range(num_warmup_iters): + start_time = time.time() + outputs = llm.generate(prompts, sampling_params) + elapsed_time = time.time() - start_time + print(f"Warmup iter {i} time: {elapsed_time} [secs]") + + iter_times = [] + for i in range(num_bench_iters): + start_time = time.time() + outputs = llm.generate(prompts, sampling_params) + iter_times.append(time.time() - start_time) + print(f"Bench iter {i} time: {iter_times[-1]} [secs]") + + average_iter_time = sum(iter_times) / num_bench_iters + print(f"Average per iter time: {average_iter_time} [secs]") + + # Print outputs of the last iter + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + return average_iter_time + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--model_name", type=str, required=True) + parser.add_argument("--model_revision", type=str, default=None) + parser.add_argument('--is_sparse', action='store_true') + parser.add_argument("--quant_method", type=str, 
default=None) + parser.add_argument("--max_seq_len", type=int, default=MAX_SEQ_LEN_DEFAULT) + parser.add_argument("--max_tokens", type=int, default=MAX_TOKENS_DEFAULT) + parser.add_argument("--num_requests", + type=int, + default=NUM_REQUESTS_DEFAULT) + parser.add_argument("--num_gpus", type=int, default=1) + parser.add_argument("--num_warmup_iters", type=int, default=1) + parser.add_argument("--num_bench_iters", type=int, default=5) + + args = parser.parse_args() + + run_bench(args.model_name, args.model_revision, args.is_sparse, + args.quant_method, args.max_seq_len, args.max_tokens, + args.num_requests, args.num_gpus, args.num_warmup_iters, + args.num_bench_iters) diff --git a/vllm/config.py b/vllm/config.py index 0b8a2a27f6d43..d7ea6fa38addd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -77,6 +77,7 @@ def __init__( tokenizer_revision: Optional[str] = None, max_model_len: Optional[int] = None, quantization: Optional[str] = None, + sparsity: Optional[str] = None, enforce_eager: bool = False, max_context_len_to_capture: Optional[int] = None, ) -> None: @@ -91,6 +92,7 @@ def __init__( self.code_revision = code_revision self.tokenizer_revision = tokenizer_revision self.quantization = quantization + self.sparsity = sparsity self.enforce_eager = enforce_eager self.max_context_len_to_capture = max_context_len_to_capture @@ -116,6 +118,7 @@ def __init__( self._verify_load_format() self._verify_tokenizer_mode() self._verify_quantization() + self._verify_sparsity() self._verify_cuda_graph() def _verify_load_format(self) -> None: @@ -154,6 +157,30 @@ def _verify_tokenizer_mode(self) -> None: "either 'auto' or 'slow'.") self.tokenizer_mode = tokenizer_mode + def _verify_sparsity(self) -> None: + supported_sparsity = ["sparse_w16a16"] + + if self.quantization is not None: + raise ValueError("Both sparsity and quantization detected. Only " + "one or the other is supported at a time.") + + if self.sparsity is not None and self.sparsity not in supported_sparsity: + raise ValueError(f"Unknown sparse method: {self.sparsity}. Must " + f"be one of {supported_sparsity}.") + + hf_sparsity_config = getattr(self.hf_config, "sparsity_config", None) + if hf_sparsity_config is not None: + hf_sparsity_method = str( + hf_sparsity_config["sparse_method"]).lower() + if self.sparsity is None: + self.sparsity = hf_sparsity_method + elif self.sparsity != hf_sparsity_method: + raise ValueError( + "Sparsity method specified in the model config " + f"({hf_sparsity_method}) does not match the sparsity " + f"method specified in the `sparsity` argument " + f"({self.sparsity}).") + def _verify_quantization(self) -> None: supported_quantization = ["awq", "gptq", "squeezellm"] rocm_not_supported_quantization = ["awq"] diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a4efd171b871d..f1772b2fb7ee7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -35,6 +35,7 @@ class EngineArgs: code_revision: Optional[str] = None tokenizer_revision: Optional[str] = None quantization: Optional[str] = None + sparsity: Optional[str] = None enforce_eager: bool = False max_context_len_to_capture: int = 8192 disable_custom_all_reduce: bool = False @@ -216,6 +217,16 @@ def add_cli_args( 'None, we assume the model weights are not ' 'quantized and use `dtype` to determine the data ' 'type of the weights.') + parser.add_argument( + '--sparsity', + '-s', + type=str, + choices=['sparse_w16a16', None], + default=None, + help='Method used to compress sparse weights. 
If ' + 'None, we first check the `sparsity_config` attribute ' + 'in the model config file. If that is None we assume ' + 'the model weights are dense') parser.add_argument('--enforce-eager', action='store_true', help='Always use eager-mode PyTorch. If False, ' @@ -290,8 +301,8 @@ def create_engine_configs( self.model, self.tokenizer, self.tokenizer_mode, self.trust_remote_code, self.download_dir, self.load_format, self.dtype, self.seed, self.revision, self.code_revision, - self.tokenizer_revision, self.max_model_len, self.quantization, - self.enforce_eager, self.max_context_len_to_capture) + self.tokenizer_revision, self.max_model_len, self.sparsity, + self.quantization, self.enforce_eager, self.max_context_len_to_capture) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f0de40f54db61..4210c30eaef7a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -91,6 +91,7 @@ def __init__( f"tensor_parallel_size={parallel_config.tensor_parallel_size}, " f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, " f"quantization={model_config.quantization}, " + f"sparsity={model_config.sparsity}, " f"enforce_eager={model_config.enforce_eager}, " f"kv_cache_dtype={cache_config.cache_dtype}, " f"device_config={device_config.device}, " diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index fc82018d18eb6..30e3bf105a1da 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -43,6 +43,11 @@ class LLM: the `quantization_config` attribute in the model config file. If that is None, we assume the model weights are not quantized and use `dtype` to determine the data type of the weights. + sparsity: The format of the sparse model weights. Currently, + we support "sparse_w16a16". If None, we first check the `sparsity` + attribute in the model config file. If that is None, we assume the + model weights are dense and use `dtype` to determine the data + type of the weights. revision: The specific model version to use. It can be a branch name, a tag name, or a commit id. tokenizer_revision: The specific tokenizer version to use. 
It can be a @@ -76,6 +81,7 @@ def __init__( tensor_parallel_size: int = 1, dtype: str = "auto", quantization: Optional[str] = None, + sparsity: Optional[str] = None, revision: Optional[str] = None, tokenizer_revision: Optional[str] = None, seed: int = 0, @@ -96,6 +102,7 @@ def __init__( tensor_parallel_size=tensor_parallel_size, dtype=dtype, quantization=quantization, + sparsity=sparsity, revision=revision, tokenizer_revision=tokenizer_revision, seed=seed, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 55d38b763b2b5..5f4e47a05403f 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -13,6 +13,7 @@ divide, split_tensor_along_last_dim) from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger +from vllm.model_executor.layers.parameters import SparseParameter, get_param_data logger = init_logger(__name__) @@ -191,7 +192,8 @@ def __init__( def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): tp_rank = get_tensor_model_parallel_rank() output_dim = getattr(param, "output_dim", None) - param_data = param.data + param_data = get_param_data(param) + if output_dim is not None: shard_size = param_data.shape[output_dim] start_idx = tp_rank * shard_size @@ -200,6 +202,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) + # If SparseParameter, repack dense data as sparse. + if isinstance(param, SparseParameter): + param.pack() + def forward(self, input_): bias = self.bias if not self.skip_bias_add else None @@ -256,9 +262,14 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor, loaded_shard_id: Optional[int] = None): - param_data = param.data + param_data = get_param_data(param) output_dim = getattr(param, "output_dim", None) if loaded_shard_id is None: + if isinstance(param, SparseParameter): + raise NotImplementedError( + "Passing loaded_shard_id=None not yet supported for SparseParameter" + ) + # Loaded weight is already packed. if output_dim is None: assert param_data.shape == loaded_weight.shape @@ -308,6 +319,10 @@ def weight_loader(self, assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) + # If Parameter, repack dense data as sparse. + if isinstance(param, SparseParameter): + param.pack() + class QKVParallelLinear(ColumnParallelLinear): """Linear layers for the attention's QKV transformation. @@ -370,9 +385,14 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor, loaded_shard_id: Optional[str] = None): - param_data = param.data + param_data = get_param_data(param) output_dim = getattr(param, "output_dim", None) if loaded_shard_id is None: + if isinstance(param, SparseParameter): + raise NotImplementedError( + "Passing loaded_shard_id=None not yet supported for SparseParameter" + ) + # Loaded weight is already packed. if output_dim is None: assert param_data.shape == loaded_weight.shape @@ -436,6 +456,10 @@ def weight_loader(self, assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) + # If SparseParameter, repack dense data as sparse. + if isinstance(param, SparseParameter): + param.pack() + class RowParallelLinear(torch.nn.Module): """Linear layer with row parallelism. 
@@ -516,7 +540,7 @@ def __init__( def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): tp_rank = get_tensor_model_parallel_rank() input_dim = getattr(param, "input_dim", None) - param_data = param.data + param_data = get_param_data(param) if input_dim is not None: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size @@ -525,6 +549,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) + # If SparseParameter, repack dense data as sparse. + if isinstance(param, SparseParameter): + param.pack() + def forward(self, input_): # Set up backprop all-reduce. if self.input_is_parallel: diff --git a/vllm/model_executor/layers/parameters/__init__.py b/vllm/model_executor/layers/parameters/__init__.py new file mode 100644 index 0000000000000..2d41190087a0d --- /dev/null +++ b/vllm/model_executor/layers/parameters/__init__.py @@ -0,0 +1,10 @@ +import torch +from vllm.model_executor.layers.parameters.sparsity import SparseParameter + + +def get_param_data(param: torch.nn.Parameter) -> torch.Tensor: + """Gets parameter data in dense format.""" + if isinstance(param, SparseParameter): + return param.get_dense_data() + else: + return param.data diff --git a/vllm/model_executor/layers/parameters/sparsity.py b/vllm/model_executor/layers/parameters/sparsity.py new file mode 100644 index 0000000000000..37ddd05d89636 --- /dev/null +++ b/vllm/model_executor/layers/parameters/sparsity.py @@ -0,0 +1,46 @@ +import torch + +from magic_wand import SparseTensor, SparseBitmaskStorageFormat + + +class SparseParameter(SparseTensor): + + @staticmethod + def __new__( + cls, + shape: torch.Size, + dtype: torch.dtype, + ): + assert torch.__version__ > (1, + 10), "SparseTensor requires PyTorch 1.11+" + self = torch.Tensor._make_wrapper_subclass(cls, + size=shape, + dtype=dtype, + requires_grad=False) + self.storage_format_cls = SparseBitmaskStorageFormat + self.compressed_data = None + self.dense_data = None + self._is_param = True + + return self + + def get_dense_data(self) -> torch.Tensor: + if self.dense_data is not None: + raise ValueError( + "Called get_data_dense() but dense_data already exists.") + self.dense_data = self._unpack() + return self.dense_data + + def _unpack(self) -> torch.Tensor: + if self.has_compressed_data(): + return self.compressed_data.decompress() + else: + return torch.empty(size=self.shape, + dtype=self.dtype, + device="cuda") + + def pack(self) -> None: + if self.dense_data is None: + raise ValueError("Called pack() but dense_data does not exist.") + self.copy_(self.dense_data) + self.dense_data = None diff --git a/vllm/model_executor/layers/sparsity/__init__.py b/vllm/model_executor/layers/sparsity/__init__.py new file mode 100644 index 0000000000000..411d1ff642266 --- /dev/null +++ b/vllm/model_executor/layers/sparsity/__init__.py @@ -0,0 +1,20 @@ +from typing import Type + +from vllm.model_executor.layers.sparsity.base_config import SparsityConfig +from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config + +_SPARSITY_CONFIG_REGISTRY = { + "sparse_w16a16": SparseW16A16Config, +} + + +def get_sparsity_config(sparsity: str) -> Type[SparsityConfig]: + if sparsity not in _SPARSITY_CONFIG_REGISTRY: + raise ValueError(f"Invalid sparsity method: {sparsity}") + return _SPARSITY_CONFIG_REGISTRY[sparsity] + + +__all__ = [ + "SparsityConfig", + "get_sparsity_config", +] diff --git a/vllm/model_executor/layers/sparsity/base_config.py 
b/vllm/model_executor/layers/sparsity/base_config.py new file mode 100644 index 0000000000000..aa09fb623bc00 --- /dev/null +++ b/vllm/model_executor/layers/sparsity/base_config.py @@ -0,0 +1,51 @@ +from abc import ABC, abstractmethod +from typing import Any, Dict, List + +import torch + +from vllm.model_executor.layers.linear import LinearMethodBase + + +class SparsityConfig(ABC): + """Base class for sparsity configs.""" + + @abstractmethod + def get_name(self) -> str: + """Name of the sparse method.""" + raise NotImplementedError + + @abstractmethod + def get_supported_act_dtypes(self) -> List[torch.dtype]: + """List of supported act_dtypes.""" + raise NotImplementedError + + @abstractmethod + def get_min_capability(self) -> int: + """Minimum GPU capability to support the sparsity method.""" + raise NotImplementedError + + @staticmethod + @abstractmethod + def get_config_filenames() -> List[str]: + """List of filenames to search for in the model directory.""" + raise NotImplementedError + + @classmethod + @abstractmethod + def from_config(cls, config: Dict[str, Any]) -> "SparsityConfig": + """Create a config class from the model's sparse config.""" + raise NotImplementedError + + @staticmethod + def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any: + """Get a value from the model's sparsity config.""" + for key in keys: + if key in config: + return config[key] + raise ValueError(f"Cannot find any of {keys} in the model's " + "sparsity config.") + + @abstractmethod + def get_linear_method(self) -> LinearMethodBase: + """Get the linear method to use for the sparse linear layer.""" + raise NotImplementedError diff --git a/vllm/model_executor/layers/sparsity/sparse_w16a16.py b/vllm/model_executor/layers/sparsity/sparse_w16a16.py new file mode 100644 index 0000000000000..771fae9b8ff45 --- /dev/null +++ b/vllm/model_executor/layers/sparsity/sparse_w16a16.py @@ -0,0 +1,99 @@ +from typing import Any, Dict, List, Optional + +import torch +import torch.nn.functional as F + +from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs +from vllm.model_executor.layers.sparsity.base_config import SparsityConfig +from vllm.model_executor.layers.parameters import SparseParameter + + +class SparseW16A16Config(SparsityConfig): + """Config class for SparseW16A16. + + TODO: Add based on need + """ + + def __init__(self) -> None: + # TODO: Add new configs here + pass + + def __repr__(self) -> str: + return "SparseW16A16Config()" + + @classmethod + def get_name(cls) -> str: + return "sparse_w16a16" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.half] + + @classmethod + def get_min_capability(cls) -> int: + # TODO: Update after checks on more GPUs + return 80 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return ["sparsity_config.json"] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "SparseW16A16Config": + return cls() + + def get_linear_method(self) -> "SparseW16A16LinearMethod": + return SparseW16A16LinearMethod(self) + + +class SparseW16A16LinearMethod(LinearMethodBase): + """Linear method for Sparse W16A16. + + Args: + sparsity_config: The sparse config. 
+ """ + + def __init__(self, sparsity_config: SparseW16A16Config): + self.sparsity_config = sparsity_config + + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: + weight = SparseParameter( + shape=torch.Size( + (output_size_per_partition, input_size_per_partition)), + dtype=params_dtype, + ) + + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + + return {"weight": weight} + + def apply_weights( + self, + weights: Dict[str, Any], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + sparse_weight = weights["weight"] + + # Uncompress to dense + dense_weight = sparse_weight.to_dense() + + # # Uncomment to verify sparsity + # density = torch.count_nonzero( + # dense_weight).item() / dense_weight.numel() + # print(f"sparsity = {1.0 - density}") + + # Standard matrix multiply + if bias is not None: + output = F.linear(x, dense_weight, bias) + else: + output = F.linear(x, dense_weight) + + return output diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index ebe092b5d62ba..fd1757e1f97cf 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -8,6 +8,7 @@ from vllm.config import DeviceConfig, ModelConfig, LoRAConfig from vllm.model_executor.models import ModelRegistry from vllm.model_executor.weight_utils import (get_quant_config, + get_sparse_config, initialize_dummy_weights) @@ -42,7 +43,7 @@ def get_model(model_config: ModelConfig, lora_config: Optional[LoRAConfig] = None) -> nn.Module: model_class = _get_model_architecture(model_config) - # Get the (maybe quantized) linear method. + # Get the (maybe sparse or quantized) linear method. linear_method = None if model_config.quantization is not None: quant_config = get_quant_config(model_config) @@ -61,6 +62,26 @@ def get_model(model_config: ModelConfig, f"method {model_config.quantization}. Supported dtypes: " f"{supported_dtypes}") linear_method = quant_config.get_linear_method() + if model_config.sparsity is not None: + sparse_config = get_sparse_config(model_config.sparsity, + model_config.model, + model_config.hf_config, + model_config.download_dir) + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + if capability < sparse_config.get_min_capability(): + raise ValueError( + f"The sparsity method {model_config.sparsity} is not " + "supported for the current GPU. " + f"Minimum capability: {sparse_config.get_min_capability()}. " + f"Current capability: {capability}.") + supported_dtypes = sparse_config.get_supported_act_dtypes() + if model_config.dtype not in supported_dtypes: + raise ValueError( + f"{model_config.dtype} is not supported for sparsity " + f"method {model_config.sparsity}. Supported dtypes: " + f"{supported_dtypes}") + linear_method = sparse_config.get_linear_method() with _set_default_torch_dtype(model_config.dtype): # Create a model instance. 
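For reference, here is a minimal sketch of how the pieces above fit together: the sparsity config resolves to a linear method, `create_weights()` allocates a `SparseParameter`, the weight loader copies a dense checkpoint tensor into its dense view and re-packs it, and `apply_weights()` decompresses for a standard matmul. This mirrors `default_weight_loader()` and is only an illustration of the flow in this patch (it assumes a CUDA device and the `magic_wand` package installed), not code from the diff:

```python
import torch

from vllm.model_executor.layers.parameters import get_param_data
from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config

# Resolve the linear method from the sparsity config, as model_loader.py now does.
linear_method = SparseW16A16Config().get_linear_method()

# create_weights() returns a dict holding a SparseParameter of shape (out, in).
weights = linear_method.create_weights(256, 256, 256, 256, torch.float16)
param = weights["weight"]

# Mimic default_weight_loader(): copy a dense checkpoint tensor into the
# parameter's dense view, then pack() it back into sparse storage.
loaded_weight = torch.randn(256, 256, dtype=torch.float16, device="cuda")
get_param_data(param).copy_(loaded_weight)
param.pack()

# apply_weights() uncompresses the weight and runs a standard F.linear().
x = torch.randn(1, 256, dtype=torch.float16, device="cuda")
y = linear_method.apply_weights(weights, x)
print(y.shape)  # torch.Size([1, 256])
```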
diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 3570366887e78..f29b70ac26051 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -17,6 +17,10 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization import (get_quantization_config, QuantizationConfig) +from vllm.model_executor.layers.sparsity import (get_sparsity_config, + SparsityConfig) +from vllm.model_executor.layers.parameters import (get_param_data, + SparseParameter) logger = init_logger(__name__) @@ -82,6 +86,21 @@ def convert_bin_to_safetensor_file( raise RuntimeError(f"The output tensors do not match for key {k}") +# TODO(rib-2): Once we define hf_sparsity_config +def get_sparse_config( + sparsity: str, + model_name_or_path: str, + hf_config: PretrainedConfig, + cache_dir: Optional[str] = None, +) -> SparsityConfig: + sparsity_cls = get_sparsity_config(sparsity) + hf_sparsity_config = getattr(hf_config, "sparsity_config", None) + if hf_sparsity_config is not None: + raise NotImplementedError( + "Loading hf sparsity config not yet supported") + return sparsity_cls() + + # TODO(woosuk): Move this to other place. def get_quant_config(model_config: ModelConfig) -> QuantizationConfig: quant_cls = get_quantization_config(model_config.quantization) @@ -276,11 +295,13 @@ def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: return x -def default_weight_loader(param: torch.Tensor, +def default_weight_loader(param: torch.nn.Parameter, loaded_weight: torch.Tensor) -> None: """Default weight loader.""" assert param.size() == loaded_weight.size() - param.data.copy_(loaded_weight) + get_param_data(param).copy_(loaded_weight) + if isinstance(param, SparseParameter): + param.pack() def initialize_dummy_weights( From 5344a013b2f45e86877a3655418e919f6ab81bcf Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 2 Feb 2024 07:53:53 -0700 Subject: [PATCH 092/112] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0ab4381a16700..94bfd58792426 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ pip install -e . Run a 50% sparse model: -```bash +```python from vllm import LLM, SamplingParams model = LLM( @@ -113,4 +113,4 @@ model = LLM( sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", sampling_params=sampling_params) outputs[0].outputs[0].text -``` \ No newline at end of file +``` From 81dba477ebb268dfd710d6d2bd28b32f09a652a0 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 1 Feb 2024 23:41:01 -0500 Subject: [PATCH 093/112] Semi-structured 2:4 sparsity via SparseSemiStructuredTensor #4 magic_wand semi_structured_sparse_tensor_linear branch integrates 2:4 semi-structured sparsity into SparseTensor. This PR adds a new sparsity config for 2:4 sparsity to neuralmagic-vllm, using the SparseTensor 2:4 support. This PR also refactors the sparse linear method into a separate file, vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py, which supports all sparsity formats. 
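The added `examples/offline_inference_semi_structured_sparse.py` shows the end-user path (`LLM(..., sparsity="semi_structured_sparse_w16a16")`); as a quick orientation, the sketch below shows how the new format is resolved through the sparsity registry this patch extends. It is an illustration only, not part of the diff, and assumes a `magic_wand` build that provides `SparseSemiStructuredStorageFormat`:

```python
from vllm.model_executor.layers.sparsity import get_sparsity_config

# Look up the new 2:4 config class registered in _SPARSITY_CONFIG_REGISTRY.
config_cls = get_sparsity_config("semi_structured_sparse_w16a16")
config = config_cls.from_config({})  # no HF sparsity_config is consumed yet

# All sparsity formats share SparseW16A16LinearMethod; the storage format
# class passed to it decides how the weight is compressed and applied.
linear_method = config.get_linear_method()
print(config.get_name())                         # "semi_structured_sparse_w16a16"
print(config.get_storage_format_cls().__name__)  # SparseSemiStructuredStorageFormat
```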
--- ...ffline_inference_semi_structured_sparse.py | 12 ++++ vllm/config.py | 2 +- .../layers/parameters/sparsity.py | 34 ++++++++-- .../layers/sparsity/__init__.py | 2 + .../layers/sparsity/base_config.py | 7 ++ .../sparsity/semi_structured_sparse_w16a16.py | 46 +++++++++++++ .../layers/sparsity/sparse_w16a16.py | 67 +++---------------- .../sparsity/sparse_w16a16_linear_method.py | 55 +++++++++++++++ 8 files changed, 159 insertions(+), 66 deletions(-) create mode 100644 examples/offline_inference_semi_structured_sparse.py create mode 100644 vllm/model_executor/layers/sparsity/semi_structured_sparse_w16a16.py create mode 100644 vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py diff --git a/examples/offline_inference_semi_structured_sparse.py b/examples/offline_inference_semi_structured_sparse.py new file mode 100644 index 0000000000000..118725b4448d4 --- /dev/null +++ b/examples/offline_inference_semi_structured_sparse.py @@ -0,0 +1,12 @@ +from vllm import LLM, SamplingParams + +model = LLM("nm-testing/zephyr-50sparse-24", + sparsity="semi_structured_sparse_w16a16", + enforce_eager=True, + dtype="float16", + tensor_parallel_size=1, + max_model_len=1024) + +sampling_params = SamplingParams(max_tokens=100, temperature=0) +outputs = model.generate("Hello my name is", sampling_params=sampling_params) +print(outputs[0].outputs[0].text) diff --git a/vllm/config.py b/vllm/config.py index d7ea6fa38addd..b10817c93d69f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -158,7 +158,7 @@ def _verify_tokenizer_mode(self) -> None: self.tokenizer_mode = tokenizer_mode def _verify_sparsity(self) -> None: - supported_sparsity = ["sparse_w16a16"] + supported_sparsity = ["sparse_w16a16", "semi_structured_sparse_w16a16"] if self.quantization is not None: raise ValueError("Both sparsity and quantization detected. 
Only " diff --git a/vllm/model_executor/layers/parameters/sparsity.py b/vllm/model_executor/layers/parameters/sparsity.py index 37ddd05d89636..017fb6b825965 100644 --- a/vllm/model_executor/layers/parameters/sparsity.py +++ b/vllm/model_executor/layers/parameters/sparsity.py @@ -1,29 +1,35 @@ import torch -from magic_wand import SparseTensor, SparseBitmaskStorageFormat +from typing import Type +from magic_wand import (SparseTensor, CompressedStorageFormat, + SparseBitmaskStorageFormat) class SparseParameter(SparseTensor): @staticmethod - def __new__( - cls, - shape: torch.Size, - dtype: torch.dtype, - ): + def __new__(cls, + shape: torch.Size, + dtype: torch.dtype, + storage_format_cls: Type[ + CompressedStorageFormat] = SparseBitmaskStorageFormat): assert torch.__version__ > (1, 10), "SparseTensor requires PyTorch 1.11+" + self = torch.Tensor._make_wrapper_subclass(cls, size=shape, dtype=dtype, requires_grad=False) - self.storage_format_cls = SparseBitmaskStorageFormat + self.storage_format_cls = storage_format_cls self.compressed_data = None self.dense_data = None self._is_param = True return self + def has_compressed_data(self) -> bool: + return (self.compressed_data is not None) + def get_dense_data(self) -> torch.Tensor: if self.dense_data is not None: raise ValueError( @@ -39,6 +45,20 @@ def _unpack(self) -> torch.Tensor: dtype=self.dtype, device="cuda") + @classmethod + def _copy(cls, arg0, arg1): + assert arg0.shape == arg1.shape + + if arg0.has_compressed_data(): + arg0.compressed_data.copy_(arg1) + else: + arg0.compressed_data = arg0.storage_format_cls.compress(arg1) + + return arg0 + + def copy_(self, src, non_blocking=False): + return SparseParameter._copy(self, src) + def pack(self) -> None: if self.dense_data is None: raise ValueError("Called pack() but dense_data does not exist.") diff --git a/vllm/model_executor/layers/sparsity/__init__.py b/vllm/model_executor/layers/sparsity/__init__.py index 411d1ff642266..82893916fde80 100644 --- a/vllm/model_executor/layers/sparsity/__init__.py +++ b/vllm/model_executor/layers/sparsity/__init__.py @@ -2,9 +2,11 @@ from vllm.model_executor.layers.sparsity.base_config import SparsityConfig from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config +from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import SemiStructuredSparseW16A16Config _SPARSITY_CONFIG_REGISTRY = { "sparse_w16a16": SparseW16A16Config, + "semi_structured_sparse_w16a16": SemiStructuredSparseW16A16Config, } diff --git a/vllm/model_executor/layers/sparsity/base_config.py b/vllm/model_executor/layers/sparsity/base_config.py index aa09fb623bc00..fe46b55cbf39f 100644 --- a/vllm/model_executor/layers/sparsity/base_config.py +++ b/vllm/model_executor/layers/sparsity/base_config.py @@ -2,13 +2,20 @@ from typing import Any, Dict, List import torch +from typing import Type from vllm.model_executor.layers.linear import LinearMethodBase +from magic_wand import CompressedStorageFormat class SparsityConfig(ABC): """Base class for sparsity configs.""" + @abstractmethod + def get_storage_format_cls(self) -> Type[CompressedStorageFormat]: + """Sparse representation format""" + raise NotImplementedError + @abstractmethod def get_name(self) -> str: """Name of the sparse method.""" diff --git a/vllm/model_executor/layers/sparsity/semi_structured_sparse_w16a16.py b/vllm/model_executor/layers/sparsity/semi_structured_sparse_w16a16.py new file mode 100644 index 0000000000000..2cdd34fd0ff1c --- /dev/null +++ 
b/vllm/model_executor/layers/sparsity/semi_structured_sparse_w16a16.py @@ -0,0 +1,46 @@ +import torch + +from typing import Any, Dict, List, Type +from vllm.model_executor.layers.sparsity.base_config import SparsityConfig +from .sparse_w16a16_linear_method import SparseW16A16LinearMethod +from magic_wand import (CompressedStorageFormat, + SparseSemiStructuredStorageFormat) + + +class SemiStructuredSparseW16A16Config(SparsityConfig): + """Config class for SemiStructuredSparseW16A16.""" + + def __init__(self) -> None: + pass + + def __repr__(self) -> str: + return "SemiStructuredSparseW16A16Config()" + + @classmethod + def get_storage_format_cls(cls) -> Type[CompressedStorageFormat]: + return SparseSemiStructuredStorageFormat + + @classmethod + def get_name(cls) -> str: + return "semi_structured_sparse_w16a16" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + # TODO: Update after checks on more GPUs + return 80 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return ["sparsity_config.json"] + + @classmethod + def from_config( + cls, config: Dict[str, Any]) -> "SemiStructuredSparseW16A16Config": + return cls() + + def get_linear_method(self) -> "SparseW16A16LinearMethod": + return SparseW16A16LinearMethod(self, self.get_storage_format_cls()) diff --git a/vllm/model_executor/layers/sparsity/sparse_w16a16.py b/vllm/model_executor/layers/sparsity/sparse_w16a16.py index 771fae9b8ff45..69905eab0c0af 100644 --- a/vllm/model_executor/layers/sparsity/sparse_w16a16.py +++ b/vllm/model_executor/layers/sparsity/sparse_w16a16.py @@ -1,11 +1,11 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Type import torch -import torch.nn.functional as F -from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs from vllm.model_executor.layers.sparsity.base_config import SparsityConfig -from vllm.model_executor.layers.parameters import SparseParameter + +from .sparse_w16a16_linear_method import SparseW16A16LinearMethod +from magic_wand import (CompressedStorageFormat, SparseBitmaskStorageFormat) class SparseW16A16Config(SparsityConfig): @@ -21,6 +21,10 @@ def __init__(self) -> None: def __repr__(self) -> str: return "SparseW16A16Config()" + @classmethod + def get_storage_format_cls(cls) -> Type[CompressedStorageFormat]: + return SparseBitmaskStorageFormat + @classmethod def get_name(cls) -> str: return "sparse_w16a16" @@ -43,57 +47,4 @@ def from_config(cls, config: Dict[str, Any]) -> "SparseW16A16Config": return cls() def get_linear_method(self) -> "SparseW16A16LinearMethod": - return SparseW16A16LinearMethod(self) - - -class SparseW16A16LinearMethod(LinearMethodBase): - """Linear method for Sparse W16A16. - - Args: - sparsity_config: The sparse config. 
- """ - - def __init__(self, sparsity_config: SparseW16A16Config): - self.sparsity_config = sparsity_config - - def create_weights( - self, - input_size_per_partition: int, - output_size_per_partition: int, - input_size: int, - output_size: int, - params_dtype: torch.dtype, - ) -> Dict[str, Any]: - weight = SparseParameter( - shape=torch.Size( - (output_size_per_partition, input_size_per_partition)), - dtype=params_dtype, - ) - - set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) - - return {"weight": weight} - - def apply_weights( - self, - weights: Dict[str, Any], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - sparse_weight = weights["weight"] - - # Uncompress to dense - dense_weight = sparse_weight.to_dense() - - # # Uncomment to verify sparsity - # density = torch.count_nonzero( - # dense_weight).item() / dense_weight.numel() - # print(f"sparsity = {1.0 - density}") - - # Standard matrix multiply - if bias is not None: - output = F.linear(x, dense_weight, bias) - else: - output = F.linear(x, dense_weight) - - return output + return SparseW16A16LinearMethod(self, self.get_storage_format_cls()) diff --git a/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py b/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py new file mode 100644 index 0000000000000..e2fecda663b60 --- /dev/null +++ b/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py @@ -0,0 +1,55 @@ +from typing import Any, Dict, Optional, Type + +import torch +import torch.nn.functional as F + +from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs +from vllm.model_executor.layers.sparsity.base_config import SparsityConfig +from vllm.model_executor.layers.parameters import SparseParameter +from magic_wand import (CompressedStorageFormat, + SparseSemiStructuredStorageFormat) + + +class SparseW16A16LinearMethod(LinearMethodBase): + """Linear method for Sparse W16A16. + + Args: + sparsity_config: The sparse config. + """ + storage_format_cls: Type[CompressedStorageFormat] = None + + def __init__(self, sparsity_config: SparsityConfig, + storage_format_cls: Type[CompressedStorageFormat]): + self.sparsity_config = sparsity_config + self.storage_format_cls = storage_format_cls + + def create_weights(self, input_size_per_partition: int, + output_size_per_partition: int, input_size: int, + output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + weight = SparseParameter(shape=torch.Size( + (output_size_per_partition, input_size_per_partition)), + dtype=params_dtype, + storage_format_cls=self.storage_format_cls) + + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + + return {"weight": weight} + + def apply_weights( + self, + weights: Dict[str, Any], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + sparse_weight = weights["weight"] + + if self.storage_format_cls == SparseSemiStructuredStorageFormat: + output = F.linear(x, sparse_weight, bias) + return output + else: + + # Standard matrix multiply + # Uncompress to dense + output = F.linear(x, sparse_weight.to_dense(), bias) + return output From cf8eed72d61db9829e26a5677dd936cba4a5898e Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 14 Feb 2024 17:53:05 -0500 Subject: [PATCH 094/112] Sparse fused gemm integration (#12) Summary: Initial integration for the sparse-fused gemm. To achieve this, we need to ensure that we compress the weight matrix only once and never decompress it, as decompression is currently unsupported. 
Before this change, using `SparseParameter(SparseTensor)` meant that in `MergedColumnParallelLinear` and `QKVParallelLinear` every time a new shard was loaded by the `weight_loader` (e.g., the "q" portion of `QKVParallelLinear`), we would decompress the tensor in-order to use narrow to update the appropriate section of the weight tensor. With this change, `SparseParameter(SparseTensor)` is replaced with `LazyCompressedParameter`, which allows us to operate on `uncompressed_data` until we explicitly compress it. At that point, the `uncompressed_data` is compressed into `compressed_data` and freed. Currently, the detection of when to call compress is somewhat hacky. For `QKVParallelLinear`, we compress only after inserting "q", "k", and "v" shard ids, and for `MergedColumnParallelLinear`, we compress once we've inserted the same number of shards as outputs (determined by `len(output_sizes)`), which implicitly assumes one shard per output. Moving away from `SparseParameter(SparseTensor)` means that `SparseTensor` no longer handles dispatching to the custom ops; instead, this is handled by `SparseW16A16LinearMethod`. I believe this is a positive change overall. `SparseTensor` was an unnecessary extra layer of abstraction/indirection originally designed for the SLoRA work, not vLLM. This did result in the 2:4 sparse implementation breaking. However, it turns out it was already broken (i.e., it was decompressing and running dense within `SparseTensor`), so we "disable" it for now ("disable" meaning decompress and run dense instead). We should revisit all of this infrastructure post-MVP. --------- Co-authored-by: Andrew Feldman --- vllm/model_executor/layers/linear.py | 54 ++++++------- .../layers/parameters/__init__.py | 13 +--- .../layers/parameters/lazy_compressed.py | 78 +++++++++++++++++++ .../layers/parameters/sparsity.py | 66 ---------------- .../layers/sparsity/sparse_w16a16.py | 4 +- .../sparsity/sparse_w16a16_linear_method.py | 47 +++++++---- vllm/model_executor/weight_utils.py | 9 +-- 7 files changed, 148 insertions(+), 123 deletions(-) create mode 100644 vllm/model_executor/layers/parameters/lazy_compressed.py delete mode 100644 vllm/model_executor/layers/parameters/sparsity.py diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 5f4e47a05403f..49e05922443d2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -13,7 +13,7 @@ divide, split_tensor_along_last_dim) from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger -from vllm.model_executor.layers.parameters import SparseParameter, get_param_data +from vllm.model_executor.layers.parameters import LazyCompressedParameter logger = init_logger(__name__) @@ -192,7 +192,7 @@ def __init__( def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): tp_rank = get_tensor_model_parallel_rank() output_dim = getattr(param, "output_dim", None) - param_data = get_param_data(param) + param_data = param.data if output_dim is not None: shard_size = param_data.shape[output_dim] @@ -202,9 +202,8 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) - # If SparseParameter, repack dense data as sparse. 
- if isinstance(param, SparseParameter): - param.pack() + if isinstance(param, LazyCompressedParameter): + param.compress() def forward(self, input_): bias = self.bias if not self.skip_bias_add else None @@ -253,6 +252,7 @@ def __init__( linear_method: Optional[LinearMethodBase] = None, ): self.output_sizes = output_sizes + self.loaded_shards = set() tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) super().__init__(input_size, sum(output_sizes), bias, gather_output, @@ -262,14 +262,9 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor, loaded_shard_id: Optional[int] = None): - param_data = get_param_data(param) + param_data = param.data output_dim = getattr(param, "output_dim", None) if loaded_shard_id is None: - if isinstance(param, SparseParameter): - raise NotImplementedError( - "Passing loaded_shard_id=None not yet supported for SparseParameter" - ) - # Loaded weight is already packed. if output_dim is None: assert param_data.shape == loaded_weight.shape @@ -316,12 +311,17 @@ def weight_loader(self, "Loading a weight without `output_dim` attribute in " "MergedColumnParallelLinear, assume the weight is " "the same for all partitions.") + + self.loaded_shards.add(loaded_shard_id) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) - # If Parameter, repack dense data as sparse. - if isinstance(param, SparseParameter): - param.pack() + # This is super hacky for now but we basically want to only compress once all + # of the shards are loaded, right now we just check if the number of shards + # loaded matches the number of outputs expected, assuming one shard per output + all_shards_loaded = (len(self.loaded_shards) == len(self.output_sizes)) + if all_shards_loaded and isinstance(param, LazyCompressedParameter): + param.compress() class QKVParallelLinear(ColumnParallelLinear): @@ -365,6 +365,7 @@ def __init__( if total_num_kv_heads is None: total_num_kv_heads = total_num_heads self.total_num_kv_heads = total_num_kv_heads + self.loaded_shards = set() # Divide the weight matrix along the last dimension. tp_size = get_tensor_model_parallel_world_size() self.num_heads = divide(self.total_num_heads, tp_size) @@ -385,14 +386,9 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor, loaded_shard_id: Optional[str] = None): - param_data = get_param_data(param) + param_data = param.data output_dim = getattr(param, "output_dim", None) if loaded_shard_id is None: - if isinstance(param, SparseParameter): - raise NotImplementedError( - "Passing loaded_shard_id=None not yet supported for SparseParameter" - ) - # Loaded weight is already packed. if output_dim is None: assert param_data.shape == loaded_weight.shape @@ -456,9 +452,14 @@ def weight_loader(self, assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) - # If SparseParameter, repack dense data as sparse. 
- if isinstance(param, SparseParameter): - param.pack() + self.loaded_shards.add(loaded_shard_id) + + # This is super hacky for now but we basically want to only compress once + # all of the shards are loaded, for the QKV matrix this means + # loading shards "q", "k" and "v" + all_shards_loaded = (self.loaded_shards == set(["q", "k", "v"])) + if all_shards_loaded and isinstance(param, LazyCompressedParameter): + param.compress() class RowParallelLinear(torch.nn.Module): @@ -540,7 +541,7 @@ def __init__( def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): tp_rank = get_tensor_model_parallel_rank() input_dim = getattr(param, "input_dim", None) - param_data = get_param_data(param) + param_data = param.data if input_dim is not None: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size @@ -549,9 +550,8 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) - # If SparseParameter, repack dense data as sparse. - if isinstance(param, SparseParameter): - param.pack() + if isinstance(param, LazyCompressedParameter): + param.compress() def forward(self, input_): # Set up backprop all-reduce. diff --git a/vllm/model_executor/layers/parameters/__init__.py b/vllm/model_executor/layers/parameters/__init__.py index 2d41190087a0d..c05cdf56e27a4 100644 --- a/vllm/model_executor/layers/parameters/__init__.py +++ b/vllm/model_executor/layers/parameters/__init__.py @@ -1,10 +1,5 @@ -import torch -from vllm.model_executor.layers.parameters.sparsity import SparseParameter +from vllm.model_executor.layers.parameters.lazy_compressed import LazyCompressedParameter - -def get_param_data(param: torch.nn.Parameter) -> torch.Tensor: - """Gets parameter data in dense format.""" - if isinstance(param, SparseParameter): - return param.get_dense_data() - else: - return param.data +__all__ = [ + "LazyCompressedParameter", +] diff --git a/vllm/model_executor/layers/parameters/lazy_compressed.py b/vllm/model_executor/layers/parameters/lazy_compressed.py new file mode 100644 index 0000000000000..96e892a03d1fb --- /dev/null +++ b/vllm/model_executor/layers/parameters/lazy_compressed.py @@ -0,0 +1,78 @@ +import numpy +import torch +from torch.utils._pytree import tree_map + +from typing import Type +from magic_wand import (CompressedStorageFormat, SparseBitmaskStorageFormat) + + +class LazyCompressedParameter(torch.Tensor): + + @staticmethod + def __new__(cls, + uncompressed_data: torch.Tensor, + storage_format_cls: Type[ + CompressedStorageFormat] = SparseBitmaskStorageFormat, + compress_transposed: bool = False): + self = torch.Tensor._make_wrapper_subclass( + cls, + size=uncompressed_data.shape, + dtype=uncompressed_data.dtype, + requires_grad=False) + self.storage_format_cls = storage_format_cls + self.compressed_data = None + self.uncompressed_data = uncompressed_data + self.compress_transposed = compress_transposed + self._is_param = True + + return self + + @property + def has_compressed_data(self) -> bool: + return (self.compressed_data is not None) + + @property + def has_uncompressed_data(self) -> bool: + return (self.uncompressed_data is not None) + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs): + ret_storage_format_cls = None + + def unwrap(e): + nonlocal ret_storage_format_cls + if isinstance(e, LazyCompressedParameter): + assert ret_storage_format_cls is None or ret_storage_format_cls == e.storage_format_cls + ret_storage_format_cls = 
e.storage_format_cls + return e.uncompressed_data if isinstance( + e, LazyCompressedParameter) else e + + rs = func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs)) + + def wrap(e): + if isinstance(e, + torch.Tensor) and ret_storage_format_cls is not None: + return LazyCompressedParameter( + e, storage_format_cls=ret_storage_format_cls) + return e + + rs = tree_map(wrap, rs) + return rs + + def compress(self) -> None: + density = torch.count_nonzero( + self.uncompressed_data).item() / numpy.prod(self.shape) + + # only compress if we have sufficient sparsity (>=45%), currently + # this applies globally across all formats including 2:4 + if (1 - density) < 0.45: + return + + if self.uncompressed_data is None: + raise ValueError( + "Called compress() but uncompressed_data does not exist.") + self.compressed_data = self.storage_format_cls.compress( + self.uncompressed_data.t( + ) if self.compress_transposed else self.uncompressed_data) + del self.uncompressed_data # free memory + self.uncompressed_data = None diff --git a/vllm/model_executor/layers/parameters/sparsity.py b/vllm/model_executor/layers/parameters/sparsity.py deleted file mode 100644 index 017fb6b825965..0000000000000 --- a/vllm/model_executor/layers/parameters/sparsity.py +++ /dev/null @@ -1,66 +0,0 @@ -import torch - -from typing import Type -from magic_wand import (SparseTensor, CompressedStorageFormat, - SparseBitmaskStorageFormat) - - -class SparseParameter(SparseTensor): - - @staticmethod - def __new__(cls, - shape: torch.Size, - dtype: torch.dtype, - storage_format_cls: Type[ - CompressedStorageFormat] = SparseBitmaskStorageFormat): - assert torch.__version__ > (1, - 10), "SparseTensor requires PyTorch 1.11+" - - self = torch.Tensor._make_wrapper_subclass(cls, - size=shape, - dtype=dtype, - requires_grad=False) - self.storage_format_cls = storage_format_cls - self.compressed_data = None - self.dense_data = None - self._is_param = True - - return self - - def has_compressed_data(self) -> bool: - return (self.compressed_data is not None) - - def get_dense_data(self) -> torch.Tensor: - if self.dense_data is not None: - raise ValueError( - "Called get_data_dense() but dense_data already exists.") - self.dense_data = self._unpack() - return self.dense_data - - def _unpack(self) -> torch.Tensor: - if self.has_compressed_data(): - return self.compressed_data.decompress() - else: - return torch.empty(size=self.shape, - dtype=self.dtype, - device="cuda") - - @classmethod - def _copy(cls, arg0, arg1): - assert arg0.shape == arg1.shape - - if arg0.has_compressed_data(): - arg0.compressed_data.copy_(arg1) - else: - arg0.compressed_data = arg0.storage_format_cls.compress(arg1) - - return arg0 - - def copy_(self, src, non_blocking=False): - return SparseParameter._copy(self, src) - - def pack(self) -> None: - if self.dense_data is None: - raise ValueError("Called pack() but dense_data does not exist.") - self.copy_(self.dense_data) - self.dense_data = None diff --git a/vllm/model_executor/layers/sparsity/sparse_w16a16.py b/vllm/model_executor/layers/sparsity/sparse_w16a16.py index 69905eab0c0af..d3a93d9b1d945 100644 --- a/vllm/model_executor/layers/sparsity/sparse_w16a16.py +++ b/vllm/model_executor/layers/sparsity/sparse_w16a16.py @@ -5,7 +5,7 @@ from vllm.model_executor.layers.sparsity.base_config import SparsityConfig from .sparse_w16a16_linear_method import SparseW16A16LinearMethod -from magic_wand import (CompressedStorageFormat, SparseBitmaskStorageFormat) +from magic_wand import (CompressedStorageFormat, 
SparseBEGemmStorageFormat) class SparseW16A16Config(SparsityConfig): @@ -23,7 +23,7 @@ def __repr__(self) -> str: @classmethod def get_storage_format_cls(cls) -> Type[CompressedStorageFormat]: - return SparseBitmaskStorageFormat + return SparseBEGemmStorageFormat @classmethod def get_name(cls) -> str: diff --git a/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py b/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py index e2fecda663b60..65713a1bf15b3 100644 --- a/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py +++ b/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py @@ -5,9 +5,9 @@ from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs from vllm.model_executor.layers.sparsity.base_config import SparsityConfig -from vllm.model_executor.layers.parameters import SparseParameter -from magic_wand import (CompressedStorageFormat, - SparseSemiStructuredStorageFormat) +from vllm.model_executor.layers.parameters import LazyCompressedParameter +from magic_wand import (CompressedStorageFormat, SparseBEGemmStorageFormat) +from magic_wand.ops import be_ds_gemm class SparseW16A16LinearMethod(LinearMethodBase): @@ -27,10 +27,15 @@ def create_weights(self, input_size_per_partition: int, output_size_per_partition: int, input_size: int, output_size: int, params_dtype: torch.dtype) -> Dict[str, Any]: - weight = SparseParameter(shape=torch.Size( - (output_size_per_partition, input_size_per_partition)), - dtype=params_dtype, - storage_format_cls=self.storage_format_cls) + supports_linear = (self.storage_format_cls != + SparseBEGemmStorageFormat) + weight = LazyCompressedParameter( + torch.empty((output_size_per_partition, input_size_per_partition), + dtype=params_dtype), + storage_format_cls=self.storage_format_cls, + # if we don't support F.linear or something analogous, + # transpose when we compress so we can use a basic matmul + compress_transposed=not supports_linear) set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) @@ -42,14 +47,28 @@ def apply_weights( x: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - sparse_weight = weights["weight"] + w: LazyCompressedParameter = weights["weight"] - if self.storage_format_cls == SparseSemiStructuredStorageFormat: - output = F.linear(x, sparse_weight, bias) - return output + # if we never compressed (likely due to insufficient sparsity), + # i.e. 
have uncompressed_data run normally + if w.has_uncompressed_data: + assert not w.has_compressed_data + output = F.linear(x, w.uncompressed_data, bias) + # The current 2:4 implementation was running dense so ignore it + # for now and instead just explicitly decompress as usual + # elif self.storage_format_cls == SparseSemiStructuredStorageFormat: + # assert bias is None + # raise NotImplementedError + elif self.storage_format_cls == SparseBEGemmStorageFormat: + assert bias is None + assert w.compress_transposed + out_shape = (x.shape[:-1] + (w.shape[0], )) + reshaped_x = x.reshape(-1, x.shape[-1]) + y = be_ds_gemm(reshaped_x, w.compressed_data) + return y.reshape(out_shape) else: - # Standard matrix multiply # Uncompress to dense - output = F.linear(x, sparse_weight.to_dense(), bias) - return output + assert not w.compress_transposed + output = F.linear(x, w.compressed_data.decompress(), bias) + return output diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index f29b70ac26051..23c352c664d4b 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -19,8 +19,7 @@ QuantizationConfig) from vllm.model_executor.layers.sparsity import (get_sparsity_config, SparsityConfig) -from vllm.model_executor.layers.parameters import (get_param_data, - SparseParameter) +from vllm.model_executor.layers.parameters import LazyCompressedParameter logger = init_logger(__name__) @@ -299,9 +298,9 @@ def default_weight_loader(param: torch.nn.Parameter, loaded_weight: torch.Tensor) -> None: """Default weight loader.""" assert param.size() == loaded_weight.size() - get_param_data(param).copy_(loaded_weight) - if isinstance(param, SparseParameter): - param.pack() + param.data.copy_(loaded_weight) + if isinstance(param, LazyCompressedParameter): + param.compress() def initialize_dummy_weights( From 7527b9c029151e3330c4e999e3d528239f969fa4 Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Fri, 16 Feb 2024 09:28:11 -0500 Subject: [PATCH 095/112] Abf149/fix semi structured sparse (#16) SUMMARY: - Fix bug whereby 2:4 is not being invoked - Eschew SparseTensor based implementation TESTING: - examples/offline_inference_semi_structured_sparse.py --------- Co-authored-by: Lucas Wilkinson --- .../sparsity/sparse_w16a16_linear_method.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py b/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py index 65713a1bf15b3..b194e984a9254 100644 --- a/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py +++ b/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py @@ -6,7 +6,10 @@ from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs from vllm.model_executor.layers.sparsity.base_config import SparsityConfig from vllm.model_executor.layers.parameters import LazyCompressedParameter -from magic_wand import (CompressedStorageFormat, SparseBEGemmStorageFormat) +from magic_wand.semi_structured import (pad_tensor_to_multiple, + extract_valid_rows) +from magic_wand import (CompressedStorageFormat, SparseBEGemmStorageFormat, + SparseSemiStructuredStorageFormat) from magic_wand.ops import be_ds_gemm @@ -54,11 +57,18 @@ def apply_weights( if w.has_uncompressed_data: assert not w.has_compressed_data output = F.linear(x, w.uncompressed_data, bias) - # The current 2:4 implementation was running dense so ignore it - # for 
now and instead just explicitly decompress as usual - # elif self.storage_format_cls == SparseSemiStructuredStorageFormat: - # assert bias is None - # raise NotImplementedError + elif self.storage_format_cls == SparseSemiStructuredStorageFormat: + assert bias is None + w_encap = w.compressed_data.encapsulated_torch_sparse_tensor + out_shape = (x.shape[:-1] + (w_encap.shape[0], )) + reshaped_x, valid_rows_range = pad_tensor_to_multiple( + x.reshape(-1, x.shape[-1]), 8) + output = F.linear( + reshaped_x, w_encap, + torch.nn.Parameter(torch.zeros((w_encap.shape[0], ))).to( + reshaped_x.dtype).to(reshaped_x.device)).contiguous() + output = extract_valid_rows(output, valid_rows_range) + return output.reshape(out_shape) elif self.storage_format_cls == SparseBEGemmStorageFormat: assert bias is None assert w.compress_transposed From 3c11f56db6bc18558cd3b738e2f25e231a1d703a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 16 Feb 2024 07:28:02 -0800 Subject: [PATCH 096/112] Enable bfloat16 for sparse_w16a16 (#18) --- vllm/model_executor/layers/sparsity/sparse_w16a16.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/sparsity/sparse_w16a16.py b/vllm/model_executor/layers/sparsity/sparse_w16a16.py index d3a93d9b1d945..7a729ac2badd6 100644 --- a/vllm/model_executor/layers/sparsity/sparse_w16a16.py +++ b/vllm/model_executor/layers/sparsity/sparse_w16a16.py @@ -31,7 +31,7 @@ def get_name(cls) -> str: @classmethod def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.half] + return [torch.float16, torch.bfloat16] @classmethod def get_min_capability(cls) -> int: From 8147811509372127e91b3389a181938f73c3cd08 Mon Sep 17 00:00:00 2001 From: Andy Linfoot <78757007+andy-neuma@users.noreply.github.com> Date: Fri, 16 Feb 2024 12:54:02 -0500 Subject: [PATCH 097/112] seed workflow (#19) SUMMARY * add callable seed workflow for initial boundary testing Co-authored-by: marcella-found --- .github/workflows/build-test.yml | 62 ++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 .github/workflows/build-test.yml diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml new file mode 100644 index 0000000000000..392a92fa637a0 --- /dev/null +++ b/.github/workflows/build-test.yml @@ -0,0 +1,62 @@ +name: build-test +on: + # makes workflow reusable + workflow_call: + inputs: + label: + description: "requested runner label (specifies instance)" + type: string + required: true + timeout: + description: "time limit for run in minutes " + type: string + required: true + gitref: + description: "git commit hash or branch name" + type: string + required: true + python: + description: "python version, e.g. 3.10.12" + type: string + required: true + + # makes workflow manually callable + workflow_dispatch: + inputs: + label: + description: "requested runner label (specifies instance)" + type: string + required: true + timeout: + description: "time limit for run in minutes " + type: string + required: true + gitref: + description: "git commit hash or branch name" + type: string + required: true + python: + description: "python version, e.g. 
3.10.12" + type: string + required: true + +jobs: + + BUILD-TEST: + + runs-on: ${{ inputs.label }} + timeout-minutes: ${{ fromJson(inputs.timeout) }} + + steps: + + - name: checkout + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ inputs.gitref }} + submodules: recursive + + - name: hello world + run: | + echo "HELLO WORLD" >> $GITHUB_STEP_SUMMARY From e802bc215c6df01165a0e48d3858e1d979290cb4 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 16 Feb 2024 14:02:35 -0800 Subject: [PATCH 098/112] Add bias support for sparse layers (#25) --- .../sparsity/sparse_w16a16_linear_method.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py b/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py index b194e984a9254..7a3b8d30beabd 100644 --- a/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py +++ b/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py @@ -58,24 +58,32 @@ def apply_weights( assert not w.has_compressed_data output = F.linear(x, w.uncompressed_data, bias) elif self.storage_format_cls == SparseSemiStructuredStorageFormat: - assert bias is None w_encap = w.compressed_data.encapsulated_torch_sparse_tensor out_shape = (x.shape[:-1] + (w_encap.shape[0], )) reshaped_x, valid_rows_range = pad_tensor_to_multiple( x.reshape(-1, x.shape[-1]), 8) + if bias is None: + bias = torch.nn.Parameter( + torch.zeros( + (w_encap.shape[0], ), + dtype=reshaped_x.dtype, + device=reshaped_x.device, + )) output = F.linear( - reshaped_x, w_encap, - torch.nn.Parameter(torch.zeros((w_encap.shape[0], ))).to( - reshaped_x.dtype).to(reshaped_x.device)).contiguous() - output = extract_valid_rows(output, valid_rows_range) - return output.reshape(out_shape) + reshaped_x, + w_encap, + bias, + ).contiguous() + output = extract_valid_rows(output, + valid_rows_range).reshape(out_shape) elif self.storage_format_cls == SparseBEGemmStorageFormat: - assert bias is None assert w.compress_transposed out_shape = (x.shape[:-1] + (w.shape[0], )) reshaped_x = x.reshape(-1, x.shape[-1]) - y = be_ds_gemm(reshaped_x, w.compressed_data) - return y.reshape(out_shape) + output = be_ds_gemm(reshaped_x, + w.compressed_data).reshape(out_shape) + if bias is not None: + output = output + bias else: # Standard matrix multiply # Uncompress to dense From b97665356b308f780d70212df9bd3c43e1433455 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 20 Feb 2024 16:11:29 -0800 Subject: [PATCH 099/112] Use naive decompress for SM<8.0 (#32) A warning will be printed out if this case is triggered: ``` WARNING 02-20 22:21:27 sparse_w16a16.py:32] Unstructured sparse kernels are not optimized for NVIDIA SM < 8.0. 
Naive decompress kernels will be used and can be slower than dense models ``` Works on a T4 with: ```python from vllm import LLM, SamplingParams model = LLM( "nm-testing/opt-125m-pruned2.4", sparsity="sparse_w16a16", enforce_eager=True, dtype="float16", ) sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", sampling_params=sampling_params) outputs[0].outputs[0].text ``` Test within colab: https://colab.research.google.com/drive/15xRvWX5gNaTb00BcaXhxwMm6yxavIKGN?usp=sharing --- .../layers/sparsity/sparse_w16a16.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/sparsity/sparse_w16a16.py b/vllm/model_executor/layers/sparsity/sparse_w16a16.py index 7a729ac2badd6..1eb59bc269c27 100644 --- a/vllm/model_executor/layers/sparsity/sparse_w16a16.py +++ b/vllm/model_executor/layers/sparsity/sparse_w16a16.py @@ -2,20 +2,20 @@ import torch +from vllm.logger import init_logger from vllm.model_executor.layers.sparsity.base_config import SparsityConfig from .sparse_w16a16_linear_method import SparseW16A16LinearMethod -from magic_wand import (CompressedStorageFormat, SparseBEGemmStorageFormat) +from magic_wand import (CompressedStorageFormat, SparseBitmaskStorageFormat, + SparseBEGemmStorageFormat) +logger = init_logger(__name__) -class SparseW16A16Config(SparsityConfig): - """Config class for SparseW16A16. - TODO: Add based on need - """ +class SparseW16A16Config(SparsityConfig): + """Config class for SparseW16A16.""" def __init__(self) -> None: - # TODO: Add new configs here pass def __repr__(self) -> str: @@ -23,7 +23,15 @@ def __repr__(self) -> str: @classmethod def get_storage_format_cls(cls) -> Type[CompressedStorageFormat]: - return SparseBEGemmStorageFormat + cuda_compute_capability = torch.cuda.get_device_capability() + if cuda_compute_capability >= (8, 0): + return SparseBEGemmStorageFormat + else: + # For NVIDIA SM < 8.0 + logger.warning("Unstructured sparse kernels are not optimized for " + "NVIDIA SM < 8.0. 
Naive decompress kernels will be " + "used and can be slower than dense models") + return SparseBitmaskStorageFormat @classmethod def get_name(cls) -> str: @@ -35,8 +43,7 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]: @classmethod def get_min_capability(cls) -> int: - # TODO: Update after checks on more GPUs - return 80 + return 70 @classmethod def get_config_filenames(cls) -> List[str]: From 78ba5c1d004c52914b9942c04f32ea3b5edad61b Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Wed, 21 Feb 2024 18:31:57 +0530 Subject: [PATCH 100/112] Varun/benchmark workflow (#28) Add initial bechmark workflow --------- Co-authored-by: Varun Sundar Rabindranath --- .github/actions/nm-benchmark/action.yml | 17 +++++ .github/actions/nm-workflow-info/action.yml | 25 +++++++ .github/workflows/nm-benchmark.yml | 72 +++++++++++++++++++ .../workflows/scripts/nm-run-benchmarks.sh | 15 ++++ 4 files changed, 129 insertions(+) create mode 100644 .github/actions/nm-benchmark/action.yml create mode 100644 .github/actions/nm-workflow-info/action.yml create mode 100644 .github/workflows/nm-benchmark.yml create mode 100644 .github/workflows/scripts/nm-run-benchmarks.sh diff --git a/.github/actions/nm-benchmark/action.yml b/.github/actions/nm-benchmark/action.yml new file mode 100644 index 0000000000000..37fb2a60e4c8b --- /dev/null +++ b/.github/actions/nm-benchmark/action.yml @@ -0,0 +1,17 @@ +name: run vllm benchmarks +description: 'run vllm benchmarks' +inputs: + output_directory: + description: 'output directory to store the benchmark results' + required: true +runs: + using: composite + steps: + - id: benchmark + run: | + mkdir -p ${{ inputs.output_directory }} + SUCCESS=0 + .github/workflows/scripts/nm-run-benchmarks.sh ${{ inputs.output_directory }} || SUCCESS=$? + echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT" + exit ${SUCCESS} + shell: bash diff --git a/.github/actions/nm-workflow-info/action.yml b/.github/actions/nm-workflow-info/action.yml new file mode 100644 index 0000000000000..76628f26f2831 --- /dev/null +++ b/.github/actions/nm-workflow-info/action.yml @@ -0,0 +1,25 @@ +name: workflow info +description: 'give a brief summary of workflow parameters' +inputs: + label: + description: "requested runner label (specifies instance)" + required: true + timeout: + description: "time limit for run in minutes " + required: true + gitref: + description: "git commit hash or branch name" + required: true +runs: + using: composite + steps: + - run: | + echo "workflow started ..." + echo "label: '${{ inputs.label }}'" + echo "github actor: '${{ github.actor }}'" + echo "repository: '${{ github.repository }}'" + echo "gitref: '${{ inputs.gitref }}'" + echo "branch name: '${{ github.ref_name }}'" + echo "user on instance: $(whoami)" + echo "workflow timeout: ${{ inputs.timeout }} (min)" + shell: bash \ No newline at end of file diff --git a/.github/workflows/nm-benchmark.yml b/.github/workflows/nm-benchmark.yml new file mode 100644 index 0000000000000..d7a2c5f999b6e --- /dev/null +++ b/.github/workflows/nm-benchmark.yml @@ -0,0 +1,72 @@ +name: benchmark +on: + workflow_dispatch: + inputs: + label: + description: "requested runner label (specifies instance)" + type: string + required: true + timeout: + description: "approximate number of minutes to keep instance up (should be at least 20)." 
+ type: string + required: true + gitref: + description: "git commit hash or branch name" + type: string + required: true + +jobs: + BENCHMARK: + + runs-on: ${{ inputs.label }} + timeout-minutes: ${{ fromJSON(inputs.timeout) }} + + steps: + - name: checkout repository code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ inputs.gitref }} + submodules: recursive + + - name: workflow info + uses: ./.github/actions/nm-workflow-info/ + with: + gitref: ${{ inputs.gitref }} + label: ${{ inputs.label }} + timeout: ${{ inputs.timeout }} + + # Call the `build` action when available + #- name: build + # id: build + # uses: ./.github/actions/build/ + + - name: run benchmarks + uses: ./.github/actions/nm-benchmark/ + with: + output_directory: benchmark-results + + - name: store benchmark result artifacts + uses: actions/upload-artifact@v4 + if: success() || failure() + with: + name: ${{ github.run_id }}-${{ inputs.label }} + path: benchmark-results + retention-days: 90 + + ####################################################### + # TODO (Varun) : Remove pause once things are automated + - name: announce pause + if: success() || failure() + run: | + M=${{ inputs.timeout }} + R=$((M - 15)) + S=$((R * 60)) + echo "pausing for ${R} minutes" + + - name: pause workflow + run: | + M=${{ inputs.timeout }} + R=$((M - 15)) + S=$((R * 60)) + sleep $S diff --git a/.github/workflows/scripts/nm-run-benchmarks.sh b/.github/workflows/scripts/nm-run-benchmarks.sh new file mode 100644 index 0000000000000..81b4af83c0031 --- /dev/null +++ b/.github/workflows/scripts/nm-run-benchmarks.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -e +set -u + +if [ $# -ne 1 ]; +then + echo "run_benchmarks needs exactly 1 argument - The output path to store the benchmark results" + exit -1 +fi + +output_directory=$1 + +touch $output_directory/bench_test_1.txt +touch $output_directory/bench_test_2.txt \ No newline at end of file From fbfd764478a1d9b989c21b6c266e42d0b3532bd5 Mon Sep 17 00:00:00 2001 From: Andy Linfoot <78757007+andy-neuma@users.noreply.github.com> Date: Wed, 21 Feb 2024 12:50:46 -0500 Subject: [PATCH 101/112] initial GHA workflows for "build test" and "remote push" (#27) SUMMARY: * initial set of "actions with a little a" that are the building blocks for eventual CI system * "build test" workflow * "remote push" workflow on `a10g` * update some requirement files to have packages listed in alphabetical order NOTE: this PR is still somewhat nebulous as I'm still working through building and testing "neuralmagic-vllm" in our automation environment. TEST: currently, I'm working through various workflow components, i.e. "actions with a little a". the bits making up the actions in this PR have been constructed from my notes along the way. we can do a "complete" run that includes: linting, building, installing, and running tests. GHA link ... https://github.com/neuralmagic/neuralmagic-vllm/actions/runs/7975058564 `testmo` ... https://neuralmagic.testmo.net/automation/runs/view/8097 Latest GHA link ... 
https://github.com/neuralmagic/neuralmagic-vllm/actions/runs/7992489982 --------- Co-authored-by: andy-neuma --- .github/actions/nm-build-vllm/action.yml | 33 +++++++ .github/actions/nm-lint-python/action.yml | 23 +++++ .github/actions/nm-mypy/action.yml | 16 ++++ .github/actions/nm-run-summary/action.yml | 48 ++++++++++ .github/actions/nm-set-env/action.yml | 21 +++++ .github/actions/nm-set-python/action.yml | 30 ++++++ .github/actions/nm-test-vllm/action.yml | 36 ++++++++ .../actions/nm-testmo-run-complete/action.yml | 35 +++++++ .../actions/nm-testmo-run-create/action.yml | 59 ++++++++++++ .../nm-testmo-run-submit-thread/action.yml | 59 ++++++++++++ .github/scripts/determine-threading | 49 ++++++++++ .github/scripts/step-status | 14 +++ .github/workflows/build-test.yml | 92 ++++++++++++++++++- .github/workflows/remote-push.yml | 30 ++++++ .gitignore | 4 + requirements-dev.txt | 14 +-- requirements-neuron.txt | 12 +-- tests/models/test_mistral.py | 1 + tests/models/test_models.py | 1 + 19 files changed, 562 insertions(+), 15 deletions(-) create mode 100644 .github/actions/nm-build-vllm/action.yml create mode 100644 .github/actions/nm-lint-python/action.yml create mode 100644 .github/actions/nm-mypy/action.yml create mode 100644 .github/actions/nm-run-summary/action.yml create mode 100644 .github/actions/nm-set-env/action.yml create mode 100644 .github/actions/nm-set-python/action.yml create mode 100644 .github/actions/nm-test-vllm/action.yml create mode 100644 .github/actions/nm-testmo-run-complete/action.yml create mode 100644 .github/actions/nm-testmo-run-create/action.yml create mode 100644 .github/actions/nm-testmo-run-submit-thread/action.yml create mode 100755 .github/scripts/determine-threading create mode 100755 .github/scripts/step-status create mode 100644 .github/workflows/remote-push.yml diff --git a/.github/actions/nm-build-vllm/action.yml b/.github/actions/nm-build-vllm/action.yml new file mode 100644 index 0000000000000..780c2f99de3c6 --- /dev/null +++ b/.github/actions/nm-build-vllm/action.yml @@ -0,0 +1,33 @@ +name: build neuralmagic-vllm +description: 'build neuralmagic-vllm' +inputs: + Gi_per_thread: + description: 'requested GiB to reserve per thread' + required: true + python: + description: 'python version, e.g. 3.10.12' + required: true + venv: + description: 'name for python virtual environment' + required: true +outputs: + status: + description: "final build status from 'pip install -e'" + value: ${{ steps.build.outputs.status }} +runs: + using: composite + steps: + - id: build + run: | + # TODO: this is a hack ... fix it later + # pyenv hardcoded ... python version hardcoded ... + COMMIT=${{ github.sha }} + VENV="${{ inputs.venv }}-${COMMIT:0:7}" + source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate + pip3 install --index-url http://192.168.201.226:8080/ --trusted-host 192.168.201.226 magic-wand + pip3 install -r requirements.txt + SUCCESS=0 + pip3 install -e . || SUCCESS=$? 
+ echo "status=${SUCCESS}" >> "$GITHUB_OUTPUT" + exit ${SUCCESS} + shell: bash diff --git a/.github/actions/nm-lint-python/action.yml b/.github/actions/nm-lint-python/action.yml new file mode 100644 index 0000000000000..bcc27532dfb98 --- /dev/null +++ b/.github/actions/nm-lint-python/action.yml @@ -0,0 +1,23 @@ +name: lint python +description: "runs 'ruff' and reports errors" +outputs: + status: + description: "return code from 'ruff'" + value: ${{ steps.ruff.outputs.status }} +runs: + using: composite + steps: + - id: ruff + run: | + SUCCESS=0 + PYTHON_FILES=$(ruff .) || SUCCESS=$? + if [ ${SUCCESS} -ne 0 ]; then + echo "__Python Lint Failures:__" >> $GITHUB_STEP_SUMMARY + echo "${PYTHON_FILES}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo -e "lint: \xE2\x9D\x8C __FAILED__" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + fi + echo "status=${SUCCESS}" >> "$GITHUB_OUTPUT" + exit ${SUCCESS} + shell: bash diff --git a/.github/actions/nm-mypy/action.yml b/.github/actions/nm-mypy/action.yml new file mode 100644 index 0000000000000..ec19d7f1fb3f5 --- /dev/null +++ b/.github/actions/nm-mypy/action.yml @@ -0,0 +1,16 @@ +name: mypy +description: "run 'mypy' and report final status" +outputs: + status: + description: "final status from 'mypy'" + value: ${{ steps.mypy.outputs.status }} +runs: + using: composite + steps: + - id: mypy + run: | + SUCCESS=0 + mypy || SUCCESS=$? + echo "status=${SUCCESS}" >> "$GITHUB_OUTPUT" + exit ${SUCCESS} + shell: bash diff --git a/.github/actions/nm-run-summary/action.yml b/.github/actions/nm-run-summary/action.yml new file mode 100644 index 0000000000000..d3234beb748e6 --- /dev/null +++ b/.github/actions/nm-run-summary/action.yml @@ -0,0 +1,48 @@ +name: summary +description: 'creates a neuralmagic GHA run summary' +inputs: + label: + description: 'GHA runner label' + required: true + gitref: + description: 'git commit hash or branch name' + required: true + testmo_run_url: + description: 'testmo URL for this particular run' + required: true + python: + description: 'python version info' + required: true + lint_status: + description: 'status from python lint step' + required: true + build_status: + description: 'status from build step' + required: true + test_status: + description: 'status from test step' + required: true +runs: + using: composite + steps: + - run: | + LINT_STATUS=${{ inputs.lint_status }} + LINT_EMOJI=$(./.github/scripts/step-status ${LINT_STATUS}) + BUILD_STATUS=${{ inputs.build_status }} + BUILD_EMOJI=$(./.github/scripts/step-status ${BUILD_STATUS}) + TEST_STATUS=${{ inputs.test_status }} + TEST_EMOJI=$(./.github/scripts/step-status ${TEST_STATUS}) + echo "testmo URL: ${{ inputs.testmo_run_url }}" >> $GITHUB_STEP_SUMMARY + echo "" + echo "| Parameter | |" >> $GITHUB_STEP_SUMMARY + echo "|---|---|" >> $GITHUB_STEP_SUMMARY + echo "| label: | \`${{ inputs.label }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| git sha: | \`${{ github.sha }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| github actor: | '${{ github.actor }}' |" >> $GITHUB_STEP_SUMMARY + echo "| gitref: | '${{ inputs.gitref }}' |" >> $GITHUB_STEP_SUMMARY + echo "| branch name: | '${{ github.ref_name }}' |" >> $GITHUB_STEP_SUMMARY + echo "| python: | ${{ inputs.python }} |" >> $GITHUB_STEP_SUMMARY + echo "| lint: | ${LINT_EMOJI} |" >> $GITHUB_STEP_SUMMARY + echo "| build: | ${BUILD_EMOJI} |" >> $GITHUB_STEP_SUMMARY + echo "| test: | ${TEST_EMOJI} |" >> $GITHUB_STEP_SUMMARY + shell: bash diff --git a/.github/actions/nm-set-env/action.yml 
b/.github/actions/nm-set-env/action.yml new file mode 100644 index 0000000000000..d5b108d97ba4a --- /dev/null +++ b/.github/actions/nm-set-env/action.yml @@ -0,0 +1,21 @@ +name: set neuralmagic env +description: 'sets environment variables for neuralmagic' +inputs: + hf_home: + description: 'Hugging Face home' + required: true +runs: + using: composite + steps: + - run: | + echo "HF_HOME=${HF_HOME_TOKEN}" >> $GITHUB_ENV + echo "TORCH_CUDA_ARCH_LIST=8.0+PTX" >> $GITHUB_ENV + echo "PYENV_ROOT=/usr/local/apps/pyenv" >> $GITHUB_ENV + echo "XDG_CONFIG_HOME=/usr/local/apps" >> $GITHUB_ENV + WHOAMI=$(whoami) + echo "PATH=/usr/local/apps/pyenv/plugins/pyenv-virtualenv/shims:/usr/local/apps/pyenv/shims:/usr/local/apps/pyenv/bin:/usr/local/apps/nvm/versions/node/v16.20.2/bin:/usr/local/cuda-12.1/bin:/usr/local/cuda-12.1/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/${WHOAMI}/.local/bin:" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64::/usr/local/cuda-12.1/lib64:" >> $GITHUB_ENV + echo "PROJECT_ID=12" >> $GITHUB_ENV + env: + HF_HOME_TOKEN: ${{ inputs.hf_home }} + shell: bash diff --git a/.github/actions/nm-set-python/action.yml b/.github/actions/nm-set-python/action.yml new file mode 100644 index 0000000000000..7b37add439e35 --- /dev/null +++ b/.github/actions/nm-set-python/action.yml @@ -0,0 +1,30 @@ +name: set python +description: 'sets python version and creates venv for neuralmagic' +inputs: + python: + description: 'python version, e.g. 3.10.12' + required: true + venv: + description: 'name for python virtual environment' + required: true +outputs: + version: + description: "result from 'python --version'" + value: ${{ steps.set_python.outputs.version }} +runs: + using: composite + steps: + - id: set_python + run: | + command -v pyenv + pyenv root + pyenv versions + pyenv local ${{ inputs.python }} + COMMIT=${{ github.sha }} + VENV="${{ inputs.venv }}-${COMMIT:0:7}" + pyenv virtualenv ${VENV} + source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate + pyenv versions + VERSION=$(python --version) + echo "version=${VERSION}" >> "$GITHUB_OUTPUT" + shell: bash diff --git a/.github/actions/nm-test-vllm/action.yml b/.github/actions/nm-test-vllm/action.yml new file mode 100644 index 0000000000000..27dae15df0332 --- /dev/null +++ b/.github/actions/nm-test-vllm/action.yml @@ -0,0 +1,36 @@ +name: test neuralmagic-vllm +description: "test neuralmagic-vllm via, 'pytest tests/'" +inputs: + test_directory: + description: 'test directory, path is relative to neuralmagic-vllm' + required: true + test_xml: + description: 'filename for xml test results' + required: true + python: + description: 'python version, e.g. 3.10.12' + required: true + venv: + description: 'name for python virtual environment' + required: true +outputs: + status: + description: "final status from 'pytest tests/'" + value: ${{ steps.test.outputs.status }} +runs: + using: composite + steps: + - id: test + run: | + SUCCESS=0 + # TODO: this is a hack ... fix it later + # pyenv hardcoded ... python version hardcoded ... + COMMIT=${{ github.sha }} + VENV="${{ inputs.venv }}-${COMMIT:0:7}" + source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate + pip3 install --index-url http://192.168.201.226:8080/ --trusted-host 192.168.201.226 magic-wand + pip3 install -r requirements-dev.txt + pytest --junitxml=${{ inputs.test_xml }} ${{ inputs.test_directory }} || SUCCESS=$? 
+ echo "status=${SUCCESS}" >> "$GITHUB_OUTPUT" + exit ${SUCCESS} + shell: bash diff --git a/.github/actions/nm-testmo-run-complete/action.yml b/.github/actions/nm-testmo-run-complete/action.yml new file mode 100644 index 0000000000000..0f89cd8800211 --- /dev/null +++ b/.github/actions/nm-testmo-run-complete/action.yml @@ -0,0 +1,35 @@ +name: complete testmo run +description: 'complete neuralmagic testmo run' +inputs: + testmo_url: + description: 'testmo URL' + required: true + testmo_token: + description: 'testmo token' + required: true + testmo_run_id: + description: 'testmo run id' + required: true +runs: + using: "composite" + steps: + - run: | + echo "completing TESTMO run ..." + ## CHECK testmo_url and token + if [[ -z "${TESTMO_URL}" ]]; then + echo "The TESTMO_URL secret is not defined for this repository" + exit 1 + fi + if [[ -z "${TESTMO_TOKEN}" ]]; then + echo "The TESTMO_TOKEN secret is not defined for this repository" + exit 1 + fi + ## complete testmo run + npx testmo automation:run:complete \ + --instance "${TESTMO_URL}" \ + --run-id "${TESTMO_RUN_ID}" + env: + TESTMO_URL: ${{ inputs.testmo_url }} + TESTMO_TOKEN: ${{ inputs.testmo_token }} + TESTMO_RUN_ID: ${{ inputs.testmo_run_id }} + shell: bash diff --git a/.github/actions/nm-testmo-run-create/action.yml b/.github/actions/nm-testmo-run-create/action.yml new file mode 100644 index 0000000000000..9066a8c2f7dad --- /dev/null +++ b/.github/actions/nm-testmo-run-create/action.yml @@ -0,0 +1,59 @@ +name: create testmo run +description: 'create neuralmagic testmo run and return its ID' +inputs: + testmo_url: + description: 'testmo URL' + required: true + testmo_token: + description: 'testmo token' + required: true + source: + description: "source for testmo, e.g. 'build-test'" + required: true +outputs: + id: + description: 'testmo run id' + value: ${{ steps.testmo_id.outputs.id }} +runs: + using: "composite" + steps: + - name: create run + id: testmo_id + run: | + echo "creating TESTMO run ..." 
+ sudo mkdir -p ${HOME}/.npm + sudo chown -R $(whoami):$(whoami) ${HOME}/.npm + ## adjust resources and GHA link + npx testmo automation:resources:add-field --name git --type string --value ${GITHUB_SHA:0:7} --resources resources.json + RUN_URL="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" + ACTOR=${GITHUB_ACTOR} + BUILD=${ACTOR}-$(whoami)-gpu + echo "name: ${BUILD}" + echo "url: ${RUN_URL}" + npx testmo automation:resources:add-link --name ${BUILD} --url ${RUN_URL} --resources resources.json + ## CHECK testmo_url and token + if [[ -z "${TESTMO_URL}" ]]; then + echo "The TESTMO_URL secret is not defined for this repository" + exit 1 + fi + if [[ -z "${TESTMO_TOKEN}" ]]; then + echo "The TESTMO_TOKEN secret is not defined for this repository" + exit 1 + fi + ## construct name + BRANCH_NAME=${GITHUB_REF_NAME} + TMP=${ACTOR}-${BRANCH_NAME} + TESTMO_RUN_NAME=$(echo ${TMP} | awk '{print tolower($0)}') + echo "test run name: ${TESTMO_RUN_NAME}" + ## create testmo run + TESTMO_ID=$(npx testmo automation:run:create \ + --instance "${TESTMO_URL}" \ + --project-id "${PROJECT_ID}" \ + --name "${TESTMO_RUN_NAME}" \ + --source "${{ inputs.source }}" \ + --resources resources.json) + echo "id=${TESTMO_ID}" >> "${GITHUB_OUTPUT}" + env: + TESTMO_URL: ${{ inputs.testmo_url }} + TESTMO_TOKEN: ${{ inputs.testmo_token }} + shell: bash diff --git a/.github/actions/nm-testmo-run-submit-thread/action.yml b/.github/actions/nm-testmo-run-submit-thread/action.yml new file mode 100644 index 0000000000000..b47c882e591d0 --- /dev/null +++ b/.github/actions/nm-testmo-run-submit-thread/action.yml @@ -0,0 +1,59 @@ +name: submit results to testmo run +description: 'asynchronously submit step results to neuralmagic testmo run' +inputs: + testmo_url: + description: 'testmo URL' + required: true + testmo_token: + description: 'testmo token' + required: true + testmo_run_id: + description: 'testmo run id' + required: true + results: + description: "directory of JUnit '*.xml' formatted result files" + required: true + step_status: + description: 'status of reported step' + required: true +outputs: + status: + description: "status of updating testmo. if there was no update, then 'success' is returned." + value: ${{ steps.submit_thread.outputs.status }} +runs: + using: "composite" + steps: + - id: submit_thread + run: | + ls -al + ## if results is non-existent or there aren't results, then nothing to submit ... + REPORT=1 + RESULTS= + if [[ ! -d ${{ inputs.results }} ]]; then + REPORT=0 + else + RESULTS=$(find ${{ inputs.results }} -type f -name "*.xml") + fi + if [[ -z "${RESULTS}" ]]; then + REPORT=0 + fi + ## submit results? + SUCCESS=0 + if [ ${REPORT} -eq 1 ]; then + echo "submitting results to TESTMO run ..." + ## not checking testmo_url and token as this should be + ## called between "create" and "complete" + npx testmo automation:run:submit-thread \ + --instance ${TESTMO_URL} \ + --run-id ${TESTMO_RUN_ID} \ + --results ${RESULTS} \ + -- ./.github/scripts/step-status ${{ inputs.step_status }} + SUCCESS=$? 
+ fi + echo "status=${SUCCESS}" >> "$GITHUB_OUTPUT" + exit ${SUCCESS} + env: + TESTMO_URL: ${{ inputs.testmo_url }} + TESTMO_TOKEN: ${{ inputs.testmo_token }} + TESTMO_RUN_ID: ${{ inputs.testmo_run_id }} + shell: bash diff --git a/.github/scripts/determine-threading b/.github/scripts/determine-threading new file mode 100755 index 0000000000000..11354772a541b --- /dev/null +++ b/.github/scripts/determine-threading @@ -0,0 +1,49 @@ +#!/bin/bash -e + +usage() { + echo "Usage: ${0} " + echo + echo " -G - number of GiB per processor (includes hyperthreads, default is 1 GiB)." + echo " -h - this list of options" + echo + exit 1 +} + +Gi_PER_PROC=1 + +while getopts "hG:" OPT; do + case "${OPT}" in + h) + usage + ;; + G) + Gi_PER_PROC="${OPTARG}" + ;; + esac +done + + +# this includes hyperthreads, since we're only compiling code +# ... not doing floating point calculations +UNAME=$(uname) +ALL_PROC=1 +TOTAL_MEM=0 + +ALL_PROC=$(nproc --all) +TOTAL_MEM=$(grep MemTotal /proc/meminfo) +TOTAL_MEM=${TOTAL_MEM##MemTotal:} +TOTAL_MEM=${TOTAL_MEM%%kB} +TOTAL_MEM=$(echo $TOTAL_MEM | xargs) +TOTAL_MEM=$((TOTAL_MEM / 1048576)) + +USE_PROC=$((TOTAL_MEM / Gi_PER_PROC)) + +# constrain to have at least 1 Gi per processor +USE_PROC=$((USE_PROC > ALL_PROC ? ALL_PROC : USE_PROC)) + +# if unable to determine total memory, then just set USE_PROC to 1 +if [ ${TOTAL_MEM} -eq 0 ]; then + USE_PROC=1 +fi + +echo ${USE_PROC} diff --git a/.github/scripts/step-status b/.github/scripts/step-status new file mode 100755 index 0000000000000..b07f17517be2b --- /dev/null +++ b/.github/scripts/step-status @@ -0,0 +1,14 @@ +#!/bin/bash -e + +# echo "green encased checkmark" if "${1} == 0" +# echo "red X" if "${1} != 0" + +STEP_STATUS=${1} + +if [ $STEP_STATUS -eq 0 ]; then + # green check + echo -e "\xE2\x9C\x85" +else + # red x + echo -e "\xE2\x9D\x8C" +fi diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 392a92fa637a0..26a9b5cb89bcd 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -57,6 +57,94 @@ jobs: ref: ${{ inputs.gitref }} submodules: recursive - - name: hello world + - name: setenv + id: setenv + uses: ./.github/actions/nm-set-env/ + with: + hf_home: ${{ secrets.NM_HF_HOME }} + + - name: set python + id: set_python + uses: ./.github/actions/nm-set-python/ + with: + python: ${{ inputs.python }} + venv: TEST + + # TODO: testmo source is currently hardcoded. 
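+      # (illustrative sketch only, not wired up in this PR) the hardcoded
+      # testmo source could later be surfaced as a workflow input, e.g.:
+      #
+      #   inputs:
+      #     testmo_source:
+      #       description: "testmo automation source, e.g. 'build-test'"
+      #       type: string
+      #       default: build-test
+      #
+      # and then passed to the create-run action below as
+      # `source: ${{ inputs.testmo_source }}`; the input name here is
+      # hypothetical.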
+ - name: create testmo run + id: create_testmo_run + uses: ./.github/actions/nm-testmo-run-create/ + if: success() || failure() + with: + testmo_url: https://neuralmagic.testmo.net + testmo_token: ${{ secrets.TESTMO_TEST_TOKEN }} + source: 'build-test' + + - name: python lint + id: lint + uses: ./.github/actions/nm-lint-python/ + + - name: build + id: build + uses: ./.github/actions/nm-build-vllm/ + with: + Gi_per_thread: 1 + python: ${{ inputs.python }} + venv: TEST + + - name: test + id: test + uses: ./.github/actions/nm-test-vllm/ + with: + test_directory: tests + test_xml: test-results/all_tests.xml + python: ${{ inputs.python }} + venv: TEST + + - name: report test results + id: report_test + uses: ./.github/actions/nm-testmo-run-submit-thread/ + if: success() || failure() + with: + testmo_url: https://neuralmagic.testmo.net + testmo_token: ${{ secrets.TESTMO_TEST_TOKEN }} + testmo_run_id: ${{ steps.create_testmo_run.outputs.id }} + results: test-results + step_status: ${{ steps.test.outputs.status }} + + - name: summary + uses: ./.github/actions/nm-run-summary/ + if: success() || failure() + with: + label: ${{ inputs.label }} + gitref: ${{ inputs.gitref }} + testmo_run_url: https://neuralmagic.testmo.net/automation/runs/view/${{ steps.create_testmo_run.outputs.id }} + python: ${{ steps.set_python.outputs.version }} + lint_status: ${{ steps.lint.outputs.status }} + build_status: ${{ steps.build.outputs.status }} + test_status: ${{ steps.test.outputs.status }} + + - name: run status + id: run_status + if: success() || failure() + env: + CHECKOUT: ${{ steps.checkout.outcome }} + LINT_STATUS: ${{ steps.lint.outputs.status }} + BUILD_STATUS: ${{ steps.build.outputs.status }} + TEST_STATUS: ${{ steps.test.outputs.status }} run: | - echo "HELLO WORLD" >> $GITHUB_STEP_SUMMARY + echo "checkout status: ${CHECKOUT}" + if [[ "${CHECKOUT}" != *"success"* ]]; then exit 1; fi + if [ ${LINT_STATUS} -ne 0 ]; then exit 1; fi + if [ ${BUILD_STATUS} -ne 0 ]; then exit 1; fi + echo "build status: ${BUILD_STATUS}" + if [ ${TEST_STATUS} -ne 0 ]; then exit 1; fi + echo "test status: ${TEST_STATUS}" + + - name: complete testmo run + uses: ./.github/actions/nm-testmo-run-complete/ + if: success() || failure() + with: + testmo_url: https://neuralmagic.testmo.net + testmo_token: ${{ secrets.TESTMO_TEST_TOKEN }} + testmo_run_id: ${{ steps.create_testmo_run.outputs.id }} diff --git a/.github/workflows/remote-push.yml b/.github/workflows/remote-push.yml new file mode 100644 index 0000000000000..c10b386ceb23e --- /dev/null +++ b/.github/workflows/remote-push.yml @@ -0,0 +1,30 @@ +name: remote push +run-name: ${{ github.actor }} verifying branch '${{ github.ref }}' +on: + push: + branches-ignore: + - main + +concurrency: + group: ${{ github.head_ref || github.ref_name }} + cancel-in-progress: true + +jobs: + + # TODO: expand python matrix later, once CI system has + # matured. + # TODO: adjust timeout after we get a bit more experience. + # making it 60 is a bit permissive. 
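+  # purely illustrative (hypothetical interpreter versions, not enabled
+  # here): once the CI system has matured, the expanded matrix could look
+  # something like
+  #
+  #   strategy:
+  #     matrix:
+  #       python: [3.8.17, 3.9.17, 3.10.12, 3.11.4]
+  #
+  # with each entry fanning out into its own `build-test` invocation below.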
+ + # TODO: enable this later + AWS-AVX2-32G-A10G-24G: + strategy: + matrix: + python: [3.10.12] + uses: ./.github/workflows/build-test.yml + with: + label: aws-avx2-32G-a10g-24G + timeout: 60 + gitref: '${{ github.ref }}' + python: ${{ matrix.python }} + secrets: inherit diff --git a/.gitignore b/.gitignore index b5195629e5cf3..9a2948ae01a60 100644 --- a/.gitignore +++ b/.gitignore @@ -184,3 +184,7 @@ _build/ # Benchmark dataset *.json + +# pyenv +.python-version + diff --git a/requirements-dev.txt b/requirements-dev.txt index f8126008d0794..cbf099ab73f50 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # formatting -yapf==0.32.0 -toml==0.10.2 ruff==0.1.5 +toml==0.10.2 +yapf==0.32.0 # type checking mypy==0.991 @@ -10,12 +10,12 @@ types-requests types-setuptools # testing -pytest -pytest-forked -pytest-asyncio -httpx einops # required for MPT flash_attn # required for HuggingFace's llama implementation +httpx openai +pytest +pytest-asyncio +pytest-forked +ray requests -ray \ No newline at end of file diff --git a/requirements-neuron.txt b/requirements-neuron.txt index 3f30ed08f037d..da9c2de767af1 100644 --- a/requirements-neuron.txt +++ b/requirements-neuron.txt @@ -1,9 +1,9 @@ -sentencepiece # Required for LLaMA tokenizer. +aioprometheus[starlette] +fastapi +neuronx-cc numpy -transformers-neuronx >= 0.9.0 +pydantic >= 2.0 # Required for OpenAI server. +sentencepiece # Required for LLaMA tokenizer. torch-neuronx >= 2.1.0 -neuronx-cc -fastapi +transformers-neuronx >= 0.9.0 uvicorn[standard] -pydantic >= 2.0 # Required for OpenAI server. -aioprometheus[starlette] diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 83316fcb7469d..d1c7b5b99d7aa 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -9,6 +9,7 @@ ] +@pytest.mark.skip("running these on a10g results in process getting killed") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) diff --git a/tests/models/test_models.py b/tests/models/test_models.py index e44452e9893cf..a60cfb223b668 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -22,6 +22,7 @@ ] +@pytest.mark.skip("running these on a10g results in process getting killed") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [128]) From 37883e04ed96512c1fd8cebb973b96d5e6e8c052 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 21 Feb 2024 15:50:44 -0800 Subject: [PATCH 102/112] Only import magic_wand if sparsity is enabled (#37) Tested by making sure magic_wand was uninstalled and this code for a dense model runs fine: ```python from vllm import LLM, SamplingParams model = LLM("nm-testing/opt-125m-pruned2.4", enforce_eager=True) ``` Then testing with a sparse model run: ```python from vllm import LLM, SamplingParams model = LLM("nm-testing/opt-125m-pruned2.4", sparsity="sparse_w16a16", enforce_eager=True) ``` output: ``` ... File "/home/michael/code/neuralmagic-vllm/vllm/model_executor/weight_utils.py", line 93, in get_sparse_config from vllm.model_executor.layers.sparsity import get_sparsity_config File "/home/michael/code/neuralmagic-vllm/vllm/model_executor/layers/sparsity/__init__.py", line 6, in raise ValueError( ValueError: magic_wand is not available and required for sparsity support. 
Please install it with `pip install magic_wand` ``` --- .../layers/parameters/lazy_compressed.py | 14 +++++++++++++- vllm/model_executor/layers/sparsity/__init__.py | 13 ++++++++++--- vllm/model_executor/weight_utils.py | 5 ++--- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/parameters/lazy_compressed.py b/vllm/model_executor/layers/parameters/lazy_compressed.py index 96e892a03d1fb..a22f718197e10 100644 --- a/vllm/model_executor/layers/parameters/lazy_compressed.py +++ b/vllm/model_executor/layers/parameters/lazy_compressed.py @@ -1,9 +1,15 @@ import numpy import torch from torch.utils._pytree import tree_map +import importlib.util from typing import Type -from magic_wand import (CompressedStorageFormat, SparseBitmaskStorageFormat) + +is_magic_wand_available = importlib.util.find_spec("magic_wand") is not None + +# These are types from magic_wand, but we only want to import if required +CompressedStorageFormat = "CompressedStorageFormat" +SparseBitmaskStorageFormat = "SparseBitmaskStorageFormat" class LazyCompressedParameter(torch.Tensor): @@ -14,6 +20,12 @@ def __new__(cls, storage_format_cls: Type[ CompressedStorageFormat] = SparseBitmaskStorageFormat, compress_transposed: bool = False): + + if not is_magic_wand_available: + raise ValueError( + "magic_wand is not available and required for sparsity " + "support. Please install it with `pip install magic_wand`") + self = torch.Tensor._make_wrapper_subclass( cls, size=uncompressed_data.shape, diff --git a/vllm/model_executor/layers/sparsity/__init__.py b/vllm/model_executor/layers/sparsity/__init__.py index 82893916fde80..ee430e27a0186 100644 --- a/vllm/model_executor/layers/sparsity/__init__.py +++ b/vllm/model_executor/layers/sparsity/__init__.py @@ -1,8 +1,15 @@ from typing import Type +import importlib.util -from vllm.model_executor.layers.sparsity.base_config import SparsityConfig -from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config -from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import SemiStructuredSparseW16A16Config +is_magic_wand_available = importlib.util.find_spec("magic_wand") is not None +if not is_magic_wand_available: + raise ValueError( + "magic_wand is not available and required for sparsity " + "support. 
Please install it with `pip install magic_wand`") + +from vllm.model_executor.layers.sparsity.base_config import SparsityConfig # noqa: E402 +from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config # noqa: E402 +from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import SemiStructuredSparseW16A16Config # noqa: E402 _SPARSITY_CONFIG_REGISTRY = { "sparse_w16a16": SparseW16A16Config, diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 23c352c664d4b..49ca62cd606cf 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -17,8 +17,6 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization import (get_quantization_config, QuantizationConfig) -from vllm.model_executor.layers.sparsity import (get_sparsity_config, - SparsityConfig) from vllm.model_executor.layers.parameters import LazyCompressedParameter logger = init_logger(__name__) @@ -91,7 +89,8 @@ def get_sparse_config( model_name_or_path: str, hf_config: PretrainedConfig, cache_dir: Optional[str] = None, -) -> SparsityConfig: +): + from vllm.model_executor.layers.sparsity import get_sparsity_config sparsity_cls = get_sparsity_config(sparsity) hf_sparsity_config = getattr(hf_config, "sparsity_config", None) if hf_sparsity_config is not None: From acf16bf143a86ff97b613ef43e17af7541bc7f14 Mon Sep 17 00:00:00 2001 From: rsnm2 Date: Thu, 22 Feb 2024 15:11:10 +0000 Subject: [PATCH 103/112] manually reverted requirements to match v0.3.2 --- requirements-dev.txt | 14 +++++++------- requirements-neuron.txt | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index cbf099ab73f50..e188278f0acc9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # formatting -ruff==0.1.5 -toml==0.10.2 yapf==0.32.0 +toml==0.10.2 +ruff==0.1.5 # type checking mypy==0.991 @@ -10,12 +10,12 @@ types-requests types-setuptools # testing +pytest +pytest-forked +pytest-asyncio +httpx einops # required for MPT flash_attn # required for HuggingFace's llama implementation -httpx openai -pytest -pytest-asyncio -pytest-forked -ray requests +ray diff --git a/requirements-neuron.txt b/requirements-neuron.txt index da9c2de767af1..3f30ed08f037d 100644 --- a/requirements-neuron.txt +++ b/requirements-neuron.txt @@ -1,9 +1,9 @@ -aioprometheus[starlette] -fastapi -neuronx-cc -numpy -pydantic >= 2.0 # Required for OpenAI server. sentencepiece # Required for LLaMA tokenizer. -torch-neuronx >= 2.1.0 +numpy transformers-neuronx >= 0.9.0 +torch-neuronx >= 2.1.0 +neuronx-cc +fastapi uvicorn[standard] +pydantic >= 2.0 # Required for OpenAI server. 
+aioprometheus[starlette] From 0feedf9221bdcbc2897e761b849ed15e5b89f769 Mon Sep 17 00:00:00 2001 From: rsnm2 Date: Thu, 22 Feb 2024 15:19:46 +0000 Subject: [PATCH 104/112] reverted requirements --- requirements-dev.txt | 13 ++++++------- requirements-neuron.txt | 12 ++++++------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index f19fe6c634918..e188278f0acc9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # formatting -ruff==0.1.5 -toml==0.10.2 yapf==0.32.0 +toml==0.10.2 +ruff==0.1.5 # type checking mypy==0.991 @@ -10,13 +10,12 @@ types-requests types-setuptools # testing +pytest +pytest-forked +pytest-asyncio +httpx einops # required for MPT flash_attn # required for HuggingFace's llama implementation -httpx openai -pytest -pytest-asyncio -pytest-forked -ray requests ray diff --git a/requirements-neuron.txt b/requirements-neuron.txt index da9c2de767af1..3f30ed08f037d 100644 --- a/requirements-neuron.txt +++ b/requirements-neuron.txt @@ -1,9 +1,9 @@ -aioprometheus[starlette] -fastapi -neuronx-cc -numpy -pydantic >= 2.0 # Required for OpenAI server. sentencepiece # Required for LLaMA tokenizer. -torch-neuronx >= 2.1.0 +numpy transformers-neuronx >= 0.9.0 +torch-neuronx >= 2.1.0 +neuronx-cc +fastapi uvicorn[standard] +pydantic >= 2.0 # Required for OpenAI server. +aioprometheus[starlette] From ce8164d22dd26fd66b309a60b941d8d54c8ff1d6 Mon Sep 17 00:00:00 2001 From: rsnm2 Date: Thu, 22 Feb 2024 15:21:44 +0000 Subject: [PATCH 105/112] removed duplicate --- vllm/model_executor/weight_utils.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 375c87658b3b6..49ca62cd606cf 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -98,21 +98,6 @@ def get_sparse_config( "Loading hf sparsity config not yet supported") return sparsity_cls() -# TODO(woosuk): Move this to other place. -def get_quant_config( - quantization: str, - model_name_or_path: str, - hf_config: PretrainedConfig, - cache_dir: Optional[str] = None, -): - from vllm.model_executor.layers.sparsity import get_sparsity_config - sparsity_cls = get_sparsity_config(sparsity) - hf_sparsity_config = getattr(hf_config, "sparsity_config", None) - if hf_sparsity_config is not None: - raise NotImplementedError( - "Loading hf sparsity config not yet supported") - return sparsity_cls() - # TODO(woosuk): Move this to other place. 
def get_quant_config(model_config: ModelConfig) -> QuantizationConfig: From 166c13b2d728a564a4bf81ffa5eb76558d9f004b Mon Sep 17 00:00:00 2001 From: rsnm2 Date: Thu, 22 Feb 2024 15:28:05 +0000 Subject: [PATCH 106/112] format --- vllm/engine/arg_utils.py | 14 ++++++++------ vllm/model_executor/weight_utils.py | 12 ++++-------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f1772b2fb7ee7..fb16fec8a9d14 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -297,12 +297,14 @@ def create_engine_configs( ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig, DeviceConfig, Optional[LoRAConfig]]: device_config = DeviceConfig(self.device) - model_config = ModelConfig( - self.model, self.tokenizer, self.tokenizer_mode, - self.trust_remote_code, self.download_dir, self.load_format, - self.dtype, self.seed, self.revision, self.code_revision, - self.tokenizer_revision, self.max_model_len, self.sparsity, - self.quantization, self.enforce_eager, self.max_context_len_to_capture) + model_config = ModelConfig(self.model, self.tokenizer, + self.tokenizer_mode, self.trust_remote_code, + self.download_dir, self.load_format, + self.dtype, self.seed, self.revision, + self.code_revision, self.tokenizer_revision, + self.max_model_len, self.sparsity, + self.quantization, self.enforce_eager, + self.max_context_len_to_capture) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 49ca62cd606cf..39d0ab23854e3 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -84,15 +84,11 @@ def convert_bin_to_safetensor_file( # TODO(rib-2): Once we define hf_sparsity_config -def get_sparse_config( - sparsity: str, - model_name_or_path: str, - hf_config: PretrainedConfig, - cache_dir: Optional[str] = None, -): +def get_sparse_config(model_config: ModelConfig): from vllm.model_executor.layers.sparsity import get_sparsity_config - sparsity_cls = get_sparsity_config(sparsity) - hf_sparsity_config = getattr(hf_config, "sparsity_config", None) + sparsity_cls = get_sparsity_config(model_config.sparsity) + hf_sparsity_config = getattr(model_config.hf_config, "sparsity_config", + None) if hf_sparsity_config is not None: raise NotImplementedError( "Loading hf sparsity config not yet supported") From 1b395b4e70269585018a4aeb48aa2a3df5c37468 Mon Sep 17 00:00:00 2001 From: rsnm2 Date: Thu, 22 Feb 2024 15:37:24 +0000 Subject: [PATCH 107/112] added noqa to upstream scripts for linter --- benchmarks/backend_request_func.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index e7f74e2feaf86..7fdcb7f5362b1 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -58,13 +58,13 @@ async def async_request_tgi( try: async with session.post(url=api_url, json=payload) as response: if response.status == 200: - async for data in response.content.iter_any(): + async for data in response.content.iter_any(): # noqa if ttft == 0: ttft = time.perf_counter() - st output.ttft = ttft output.latency = time.perf_counter() - st - body = data.decode("utf-8").lstrip("data:") + body = data.decode("utf-8").lstrip("data:") # noqa output.generated_text = json.loads(body)["generated_text"] output.success = True else: @@ -104,7 +104,7 @@ async def 
async_request_vllm( try: async with session.post(url=api_url, json=payload) as response: if response.status == 200: - async for data in response.content.iter_any(): + async for data in response.content.iter_any(): # noqa if ttft == 0: ttft = time.perf_counter() - st output.ttft = ttft @@ -152,13 +152,13 @@ async def async_request_trt_llm( try: async with session.post(url=api_url, json=payload) as resp: if resp.status == 200: - async for data in resp.content.iter_any(): + async for data in resp.content.iter_any(): # noqa if ttft == 0: ttft = time.perf_counter() - st output.ttft = ttft output.latency = time.perf_counter() - st - body = data.decode("utf-8").lstrip("data:") + body = data.decode("utf-8").lstrip("data:") # noqa output.generated_text = json.loads(body)["text_output"] output.success = True @@ -255,7 +255,7 @@ async def async_request_openai_completions( if not chunk: continue - chunk = chunk.decode("utf-8").lstrip("data: ") + chunk = chunk.decode("utf-8").lstrip("data: ") # noqa if chunk == "[DONE]": latency = time.perf_counter() - st else: From 8d935bef51dc3dcf5b4d4779ac21e1a1128f29d2 Mon Sep 17 00:00:00 2001 From: rsnm2 Date: Thu, 22 Feb 2024 15:39:29 +0000 Subject: [PATCH 108/112] format --- benchmarks/backend_request_func.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 7fdcb7f5362b1..df5e55c0d05ff 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -64,7 +64,7 @@ async def async_request_tgi( output.ttft = ttft output.latency = time.perf_counter() - st - body = data.decode("utf-8").lstrip("data:") # noqa + body = data.decode("utf-8").lstrip("data:") # noqa output.generated_text = json.loads(body)["generated_text"] output.success = True else: @@ -158,7 +158,7 @@ async def async_request_trt_llm( output.ttft = ttft output.latency = time.perf_counter() - st - body = data.decode("utf-8").lstrip("data:") # noqa + body = data.decode("utf-8").lstrip("data:") # noqa output.generated_text = json.loads(body)["text_output"] output.success = True @@ -255,7 +255,7 @@ async def async_request_openai_completions( if not chunk: continue - chunk = chunk.decode("utf-8").lstrip("data: ") # noqa + chunk = chunk.decode("utf-8").lstrip("data: ") # noqa if chunk == "[DONE]": latency = time.perf_counter() - st else: From acb861598182d75859fe59a607eb6a36b5429991 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Thu, 22 Feb 2024 12:19:00 -0500 Subject: [PATCH 109/112] Sparsity fix (#40) --- ...e_semi_structured_sparse.py => offline_inference_24.py} | 0 examples/offline_inference_sparse.py | 7 +++++++ vllm/engine/arg_utils.py | 4 ++-- vllm/model_executor/model_loader.py | 5 +---- 4 files changed, 10 insertions(+), 6 deletions(-) rename examples/{offline_inference_semi_structured_sparse.py => offline_inference_24.py} (100%) create mode 100644 examples/offline_inference_sparse.py diff --git a/examples/offline_inference_semi_structured_sparse.py b/examples/offline_inference_24.py similarity index 100% rename from examples/offline_inference_semi_structured_sparse.py rename to examples/offline_inference_24.py diff --git a/examples/offline_inference_sparse.py b/examples/offline_inference_sparse.py new file mode 100644 index 0000000000000..b35ac0080e5ef --- /dev/null +++ b/examples/offline_inference_sparse.py @@ -0,0 +1,7 @@ +from vllm import LLM, SamplingParams + +model = 
LLM("nm-testing/TinyLlama-1.1B-Chat-v1.0-pruned2.4", sparsity="sparse_w16a16") + +sampling_params = SamplingParams(max_tokens=100, temperature=0) +outputs = model.generate("Hello my name is", sampling_params=sampling_params) +print(outputs[0].outputs[0].text) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index fb16fec8a9d14..c4084deeb3bba 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -302,8 +302,8 @@ def create_engine_configs( self.download_dir, self.load_format, self.dtype, self.seed, self.revision, self.code_revision, self.tokenizer_revision, - self.max_model_len, self.sparsity, - self.quantization, self.enforce_eager, + self.max_model_len, self.quantization, + self.sparsity, self.enforce_eager, self.max_context_len_to_capture) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index fd1757e1f97cf..4b470558b1494 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -63,10 +63,7 @@ def get_model(model_config: ModelConfig, f"{supported_dtypes}") linear_method = quant_config.get_linear_method() if model_config.sparsity is not None: - sparse_config = get_sparse_config(model_config.sparsity, - model_config.model, - model_config.hf_config, - model_config.download_dir) + sparse_config = get_sparse_config(model_config) capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] if capability < sparse_config.get_min_capability(): From 4b44479aa13a45f69d1742e412297ae06fba363f Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Thu, 22 Feb 2024 17:46:19 -0500 Subject: [PATCH 110/112] Rs/marlin downstream v0.3.2 (#43) Co-authored-by: Andrew Feldman Co-authored-by: Robert Shaw <114415538+rib-2@users.noreply.github.com> Co-authored-by: alexm --- csrc/ops.h | 9 + csrc/pybind.cpp | 1 + .../quantization/marlin/marlin_cuda_kernel.cu | 1144 +++++++++++++++++ examples/offline_inference_sparse.py | 3 +- requirements-dev.txt | 1 + setup.py | 2 + tests/conftest.py | 32 + tests/models/compare_utils.py | 29 + tests/models/test_marlin.py | 84 ++ vllm/config.py | 20 +- vllm/engine/arg_utils.py | 14 +- vllm/model_executor/layers/linear.py | 28 + .../layers/quantization/__init__.py | 2 + .../layers/quantization/marlin.py | 207 +++ .../layers/sparsity/sparse_w16a16.py | 4 +- 15 files changed, 1563 insertions(+), 17 deletions(-) create mode 100644 csrc/quantization/marlin/marlin_cuda_kernel.cu create mode 100644 tests/models/compare_utils.py create mode 100644 tests/models/test_marlin.py create mode 100644 vllm/model_executor/layers/quantization/marlin.py diff --git a/csrc/ops.h b/csrc/ops.h index 2bcd0c2efc5c6..4c66ba184cb85 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -80,6 +80,15 @@ torch::Tensor awq_dequantize( int split_k_iters, int thx, int thy); + +torch::Tensor marlin_gemm( + torch::Tensor &a, + torch::Tensor &b_q_weight, + torch::Tensor &b_scales, + torch::Tensor &workspace, + int64_t size_m, + int64_t size_n, + int64_t size_k); #endif void squeezellm_gemm( diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index b36d259697167..36572a90e5a56 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -52,6 +52,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { #ifndef USE_ROCM ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ"); + ops.def("marlin_gemm", 
&marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ"); #endif ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ"); ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ"); diff --git a/csrc/quantization/marlin/marlin_cuda_kernel.cu b/csrc/quantization/marlin/marlin_cuda_kernel.cu new file mode 100644 index 0000000000000..3bc4b5576f79f --- /dev/null +++ b/csrc/quantization/marlin/marlin_cuda_kernel.cu @@ -0,0 +1,1144 @@ +/* + * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include + +template inline std::string str(T x) { return std::to_string(x); } + +namespace marlin { + +constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + +// Instances of `Vec` are used to organize groups of >>registers<<, as needed +// for instance as inputs to tensor core operations. Consequently, all +// corresponding index accesses must be compile-time constants, which is why we +// extensively use `#pragma unroll` throughout the kernel code to guarantee +// this. +template struct Vec { + T elems[n]; + __device__ T &operator[](int i) { return elems[i]; } +}; + +using I4 = Vec; + +// Matrix fragments for tensor core instructions; their precise layout is +// documented here: +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type +using FragA = Vec; +using FragB = Vec; +using FragC = Vec; +using FragS = Vec; // quantization scales + +// Predicated asynchronous global->shared copy; used for inputs A where we apply +// predication to handle batchsizes that are not multiples of 16. +__device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr, + bool pred = true) { + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile("{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred), + "r"(smem), "l"(glob_ptr), "n"(BYTES)); +} + +// Asynchronous global->shared copy with a cache hint indicating that the values +// may be evicted immediately; used for quantized weights B, which are only +// accessed precisely once and should thus not pollute the L2 cache which we +// need for inputs A and outputs C. +__device__ inline void cp_async4_stream(void *smem_ptr, const void *glob_ptr) { + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile( + "{\n" + " .reg .b64 p;\n" + " createpolicy.fractional.L2::evict_first.b64 p, 1.0;" + " cp.async.cg.shared.global.L2::cache_hint [%0], [%1], %2, p;\n" + "}\n" ::"r"(smem), + "l"(glob_ptr), "n"(BYTES)); +} + +// Async copy fence. +__device__ inline void cp_async_fence() { + asm volatile("cp.async.commit_group;\n" ::); +} + +// Wait until at most `n` async copy stages are still pending. 
+template __device__ inline void cp_async_wait() { + asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); +} + +// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 +// output/accumulation. +__device__ inline void mma(const FragA &a_frag, const FragB &frag_b, + FragC &frag_c) { + const uint32_t *a = reinterpret_cast(&a_frag); + const uint32_t *b = reinterpret_cast(&frag_b); + float *c = reinterpret_cast(&frag_c); + asm volatile("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), + "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); +} + +// Instruction for loading a full 16x16 matrix fragment of operand A from shared +// memory, directly in tensor core layout. +__device__ inline void ldsm4(FragA &frag_a, const void *smem_ptr) { + uint32_t *a = reinterpret_cast(&frag_a); + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" + : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) + : "r"(smem)); +} + +// Lookup-table based 3-input logical operation; explicitly used for +// dequantization as the compiler does not seem to automatically recognize it in +// all cases. +template __device__ inline int lop3(int a, int b, int c) { + int res; + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(res) + : "r"(a), "r"(b), "r"(c), "n"(lut)); + return res; +} + +// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16 +// values. We mostly follow the strategy in the link below, with some small +// changes: +// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h +__device__ inline FragB dequant(int q) { + const int LO = 0x000f000f; + const int HI = 0x00f000f0; + const int EX = 0x64006400; + // Guarantee that the `(a & b) | c` operations are LOP3s. + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point + // directly into `SUB` and `ADD`. + const int SUB = 0x64086408; + const int MUL = 0x2c002c00; + const int ADD = 0xd480d480; + FragB frag_b; + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&SUB)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + return frag_b; +} + +// Multiply dequantized values by the corresponding quantization scale; used +// only for grouped quantization. +__device__ inline void scale(FragB &frag_b, FragS &frag_s, int i) { + half2 s = __half2half2(reinterpret_cast<__half *>(&frag_s)[i]); + frag_b[0] = __hmul2(frag_b[0], s); + frag_b[1] = __hmul2(frag_b[1], s); +} + +// Wait until barrier reaches `count`, then lock for current threadblock. +__device__ inline void barrier_acquire(int *lock, int count) { + if (threadIdx.x == 0) { + int state = -1; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" + : "=r"(state) + : "l"(lock)); + while (state != count); + } + __syncthreads(); +} + +// Release barrier and increment visitation count. 
+__device__ inline void barrier_release(int *lock, bool reset = false) { + __syncthreads(); + if (threadIdx.x == 0) { + if (reset) { + lock[0] = 0; + return; + } + int val = 1; + // Make sure that all writes since acquiring this barrier are visible + // globally, while releasing the barrier. + asm volatile("fence.acq_rel.gpu;\n"); + asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" + : + : "l"(lock), "r"(val)); + } +} + +template shared + // fetch pipeline + const int group_blocks = -1 // number of consecutive 16x16 blocks with + // a separate quantization scale + > +__global__ void +Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk + const int4 *__restrict__ B, // 4bit quantized weight matrix of shape kxn + int4 *__restrict__ C, // fp16 output buffer of shape mxn + const int4 + *__restrict__ s, // fp16 quantization scales of shape (k/groupsize)xn + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int *locks // extra global storage for barrier synchronization +) { + // Each threadblock processes one "stripe" of the B matrix with (roughly) the + // same size, which might involve multiple column "slices" (of width 16 * + // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM + // example: + // 0 1 3 + // 0 2 3 + // 1 2 4 + // While this kind of partitioning makes things somewhat more complicated, it + // ensures good utilization of all SMs for many kinds of shape and GPU + // configurations, while requiring as few slow global cross-threadblock + // reductions as possible. + + // For larger GEMMs we run multiple batchsize 64 versions in parallel for a + // better partitioning with less reductions + int parallel = 1; + if (prob_m > 16 * thread_m_blocks) { + parallel = prob_m / (16 * thread_m_blocks); + prob_m = 16 * thread_m_blocks; + } + + int k_tiles = prob_k / 16 / thread_k_blocks; + int n_tiles = prob_n / 16 / thread_n_blocks; + int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); + // Ensure that the number of tiles in each stripe is a multiple of the + // groupsize; this avoids an annoying special case where a stripe starts in + // the middle of group. + if (group_blocks != -1) + iters = (group_blocks / thread_k_blocks) * + ceildiv(iters, (group_blocks / thread_k_blocks)); + + int slice_row = (iters * blockIdx.x) % k_tiles; + int slice_col_par = (iters * blockIdx.x) / k_tiles; + int slice_col = slice_col_par; + int slice_iters; // number of threadblock tiles in the current slice + int slice_count = + 0; // total number of active threadblocks in the current slice + int slice_idx; // index of threadblock in current slice; numbered bottom to + // top + + // We can easily implement parallel problem execution by just remapping + // indices and advancing global pointers + if (slice_col_par >= n_tiles) { + A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8; + C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; + locks += (slice_col_par / n_tiles) * n_tiles; + slice_col = slice_col_par % n_tiles; + } + + // Compute all information about the current slice which is required for + // synchronization. 
+ auto init_slice = [&]() { + slice_iters = + iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) + slice_iters = 0; + if (slice_iters == 0) + return; + if (slice_row + slice_iters > k_tiles) + slice_iters = k_tiles - slice_row; + slice_count = 1; + slice_idx = 0; + int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); + if (col_first <= k_tiles * (slice_col_par + 1)) { + int col_off = col_first - k_tiles * slice_col_par; + slice_count = ceildiv(k_tiles - col_off, iters); + if (col_off > 0) + slice_count++; + int delta_first = iters * blockIdx.x - col_first; + if (delta_first < 0 || (col_off == 0 && delta_first == 0)) + slice_idx = slice_count - 1; + else { + slice_idx = slice_count - 1 - delta_first / iters; + if (col_off > 0) + slice_idx--; + } + } + if (slice_col == n_tiles) { + A += 16 * thread_m_blocks * prob_k / 8; + C += 16 * thread_m_blocks * prob_n / 8; + locks += n_tiles; + slice_col = 0; + } + }; + init_slice(); + + int a_gl_stride = prob_k / 8; // stride of the A matrix in global memory + // We typically use `constexpr` to indicate that this value is a compile-time + // constant + constexpr int a_sh_stride = + 16 * thread_k_blocks / 8; // stride of an A matrix tile in shared memory + constexpr int a_gl_rd_delta_o = + 16 * thread_k_blocks / + 8; // delta between subsequent A tiles in global memory + int a_gl_rd_delta_i = + a_gl_stride * + (threads / a_gl_rd_delta_o); // between subsequent accesses within a tile + constexpr int a_sh_wr_delta = + a_sh_stride * (threads / a_gl_rd_delta_o); // between shared memory writes + constexpr int a_sh_rd_delta_o = + 2 * ((threads / 32) / + (thread_n_blocks / 4)); // between shared memory tile reads + constexpr int a_sh_rd_delta_i = + a_sh_stride * 16; // within a shared memory tile + constexpr int a_sh_stage = + a_sh_stride * (16 * thread_m_blocks); // overall size of a tile + constexpr int a_sh_wr_iters = + ceildiv(a_sh_stage, + a_sh_wr_delta); // number of shared write iterations for a tile + + int b_gl_stride = 16 * prob_n / 32; + constexpr int b_sh_stride = 32 * thread_n_blocks / 4; + int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; + int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); + constexpr int b_sh_wr_delta = threads; + constexpr int b_sh_rd_delta = threads; + constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; + constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; + + int s_gl_stride = prob_n / 8; + constexpr int s_sh_stride = 16 * thread_n_blocks / 8; + constexpr int s_sh_stage = s_sh_stride; + int s_gl_rd_delta = s_gl_stride; + + // Global A read index of current thread. + int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + a_gl_rd += a_gl_rd_delta_o * slice_row; + // Shared write index of current thread. + int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + // Shared read index. 
+ int a_sh_rd = + a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16; + a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); + + int b_gl_rd = + b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); + b_gl_rd += b_sh_stride * slice_col; + b_gl_rd += b_gl_rd_delta_o * slice_row; + int b_sh_wr = threadIdx.x; + int b_sh_rd = threadIdx.x; + + int s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + + s_sh_stride * slice_col + threadIdx.x; + int s_sh_wr = threadIdx.x; + int s_sh_rd; + // We use a different scale layout for grouped and column-wise quantization as + // we scale a `half2` tile in column-major layout in the former and in + // row-major in the latter case. + if (group_blocks != -1) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + else + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) % 4; + + // Precompute which thread should not read memory in which iterations; this is + // needed if there are more threads than required for a certain tilesize or + // when the batchsize is not a multiple of 16. + bool a_sh_wr_pred[a_sh_wr_iters]; +#pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; + bool s_sh_wr_pred = threadIdx.x < s_sh_stride; + + // To ensure that writing and reading A tiles to/from shared memory, the + // latter in fragment format, is fully bank conflict free, we need to use a + // rather fancy XOR-based layout. The key here is that neither reads nor + // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the + // same shared memory banks. Further, it seems (based on NSight-Compute) that + // each warp must also write a consecutive memory segment? + auto transform_a = [&](int i) { + int row = i / a_gl_rd_delta_o; + return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; + }; + // Since the computation of this remapping is non-trivial and, due to our main + // loop unrolls, all shared memory accesses are static, we simply precompute + // both transformed reads and writes. + int a_sh_wr_trans[a_sh_wr_iters]; +#pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); + int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { +#pragma unroll + for (int j = 0; j < thread_m_blocks; j++) + a_sh_rd_trans[i][j] = + transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); + } + + // Since B-accesses have non-constant stride they have to be computed at + // runtime; we break dependicies between subsequent accesses with a tile by + // maintining multiple pointers (we have enough registers), a tiny + // optimization. + const int4 *B_ptr[b_sh_wr_iters]; +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; + + extern __shared__ int4 sh[]; + // Shared memory storage for global fetch pipelines. + int4 *sh_a = sh; + int4 *sh_b = sh_a + (stages * a_sh_stage); + int4 *sh_s = sh_b + (stages * b_sh_stage); + // Register storage for double buffer of shared memory reads. + FragA frag_a[2][thread_m_blocks]; + I4 frag_b_quant[2]; + FragC frag_c[thread_m_blocks][4][2]; + FragS frag_s[2][4]; + + // Zero accumulators. 
+  auto zero_accums = [&]() {
+#pragma unroll
+    for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++)
+      reinterpret_cast<float *>(frag_c)[i] = 0;
+  };
+
+  // Asynchronously fetch the next A, B and s tile from global to the next
+  // shared memory pipeline location.
+  auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
+    if (pred) {
+      int4 *sh_a_stage = sh_a + a_sh_stage * pipe;
+#pragma unroll
+      for (int i = 0; i < a_sh_wr_iters; i++) {
+        cp_async4_pred(
+            &sh_a_stage[a_sh_wr_trans[i]],
+            &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off],
+            a_sh_wr_pred[i]);
+      }
+      int4 *sh_b_stage = sh_b + b_sh_stage * pipe;
+#pragma unroll
+      for (int i = 0; i < b_sh_wr_iters; i++) {
+        cp_async4_stream(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
+        B_ptr[i] += b_gl_rd_delta_o;
+      }
+      // Only fetch scales if this tile starts a new group
+      if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) {
+        int4 *sh_s_stage = sh_s + s_sh_stage * pipe;
+        if (s_sh_wr_pred)
+          cp_async4_stream(&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
+        s_gl_rd += s_gl_rd_delta;
+      }
+    }
+    // Insert a fence even when we are winding down the pipeline to ensure that
+    // waiting is also correct at this point.
+    cp_async_fence();
+  };
+
+  // Wait until the next thread tile has been loaded to shared memory.
+  auto wait_for_stage = [&]() {
+    // We only have `stages - 2` active fetches since we are double buffering
+    // and can only issue the next fetch when it is guaranteed that the previous
+    // shared memory load is fully complete (as it may otherwise be
+    // overwritten).
+    cp_async_wait<stages - 2>();
+    __syncthreads();
+  };
+
+  // Load the next sub-tile from the current location in the shared memory pipe
+  // into the current register buffer.
+  auto fetch_to_registers = [&](int k, int pipe) {
+    // It may seem inefficient that we reload the groups for every sub-tile;
+    // however, this does not seem to be a significant bottleneck, while some
+    // theoretically better attempts have led to bad instruction ordering by
+    // the compiler and correspondingly a noticeable drop in performance.
+    if (group_blocks != -1) {
+      int4 *sh_s_stage =
+          sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
+                               (pipe / (group_blocks / thread_k_blocks)));
+      reinterpret_cast<int4 *>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
+    }
+    int4 *sh_a_stage = sh_a + a_sh_stage * pipe;
+#pragma unroll
+    for (int i = 0; i < thread_m_blocks; i++)
+      ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
+    int4 *sh_b_stage = sh_b + b_sh_stage * pipe;
+    frag_b_quant[k % 2] = *reinterpret_cast<I4 *>(
+        &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]);
+  };
+
+  // Execute the actual tensor core matmul of a sub-tile.
+  auto matmul = [&](int k) {
+// We have the m dimension as the inner loop in order to encourage overlapping
+// dequantization and matmul operations.
+#pragma unroll
+    for (int j = 0; j < 4; j++) {
+      int b_quant = frag_b_quant[k % 2][j];
+      int b_quant_shift = b_quant >> 8;
+      FragB frag_b0 = dequant(b_quant);
+      // If there are no groups, we can just scale the final output once and can
+      // avoid doing so for each weight.
+      if (group_blocks != -1)
+        scale(frag_b0, frag_s[k % 2][j], 0);
+      FragB frag_b1 = dequant(b_quant_shift);
+      if (group_blocks != -1)
+        scale(frag_b1, frag_s[k % 2][j], 1);
+#pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+        mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]);
+        mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]);
+      }
+    }
+  };
+
+  // Since we slice across the k dimension of a tile in order to increase the
+  // number of warps while keeping the n dimension of a tile reasonable, we have
+  // multiple warps that accumulate their partial sums of the same output
+  // location, which we have to reduce over in the end; we do this reduction in
+  // shared memory.
+  auto thread_block_reduce = [&]() {
+    constexpr int red_off = threads / b_sh_stride / 2;
+    if (red_off >= 1) {
+      int red_idx = threadIdx.x / b_sh_stride;
+      constexpr int red_sh_stride = b_sh_stride * 4 * 2;
+      constexpr int red_sh_delta = b_sh_stride;
+      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) +
+                      (threadIdx.x % b_sh_stride);
+
+      // Parallel logarithmic shared memory reduction. We make sure to avoid any
+      // unnecessary read or write iterations, e.g., for two warps we write only
+      // once by warp 1 and read only once by warp 0.
+
+#pragma unroll
+      for (int m_block = 0; m_block < thread_m_blocks; m_block++) {
+#pragma unroll
+        for (int i = red_off; i > 0; i /= 2) {
+          if (i <= red_idx && red_idx < 2 * i) {
+#pragma unroll
+            for (int j = 0; j < 4 * 2; j++) {
+              int red_sh_wr =
+                  red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
+              if (i < red_off) {
+                float *c_rd = reinterpret_cast<float *>(
+                    &sh[red_sh_delta * j + red_sh_rd]);
+                float *c_wr = reinterpret_cast<float *>(&sh[red_sh_wr]);
+#pragma unroll
+                for (int k = 0; k < 4; k++)
+                  reinterpret_cast<FragC *>(frag_c)[4 * 2 * m_block + j][k] +=
+                      c_rd[k] + c_wr[k];
+              }
+              sh[red_sh_wr] =
+                  reinterpret_cast<int4 *>(&frag_c)[4 * 2 * m_block + j];
+            }
+          }
+          __syncthreads();
+        }
+        if (red_idx == 0) {
+#pragma unroll
+          for (int i = 0; i < 4 * 2; i++) {
+            float *c_rd =
+                reinterpret_cast<float *>(&sh[red_sh_delta * i + red_sh_rd]);
+#pragma unroll
+            for (int j = 0; j < 4; j++)
+              reinterpret_cast<FragC *>(frag_c)[4 * 2 * m_block + i][j] +=
+                  c_rd[j];
+          }
+        }
+        __syncthreads();
+      }
+    }
+  };
+
+  // Since multiple threadblocks may process parts of the same column slice, we
+  // finally have to globally reduce over the results. As the striped
+  // partitioning minimizes the number of such reductions and our outputs are
+  // usually rather small, we perform this reduction serially in L2 cache.
+  auto global_reduce = [&](bool first = false, bool last = false) {
+    // We are very careful here to reduce directly in the output buffer to
+    // maximize L2 cache utilization in this step. To do this, we write out
+    // results in FP16 (but still reduce with FP32 compute).
+    constexpr int active_threads = 32 * thread_n_blocks / 4;
+    if (threadIdx.x < active_threads) {
+      int c_gl_stride = prob_n / 8;
+      int c_gl_wr_delta_o = 8 * c_gl_stride;
+      int c_gl_wr_delta_i = 4 * (active_threads / 32);
+      int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) +
+                    4 * (threadIdx.x / 32) + threadIdx.x % 4;
+      c_gl_wr += (2 * thread_n_blocks) * slice_col;
+      constexpr int c_sh_wr_delta = active_threads;
+      int c_sh_wr = threadIdx.x;
+
+      int row = (threadIdx.x % 32) / 4;
+
+      if (!first) {
+// Interestingly, doing direct global accesses here really seems to mess up the
+// compiler and lead to slowdowns, hence we also use async-copies even though
+// these fetches are not actually asynchronous.
+#pragma unroll
+        for (int i = 0; i < thread_m_blocks * 4; i++) {
+          cp_async4_pred(&sh[c_sh_wr + c_sh_wr_delta * i],
+                         &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
+                            c_gl_wr_delta_i * (i % 2)],
+                         i < (thread_m_blocks - 1) * 4 ||
+                             8 * (i / 2) + row < prob_m);
+        }
+        cp_async_fence();
+        cp_async_wait<0>();
+      }
+
+#pragma unroll
+      for (int i = 0; i < thread_m_blocks * 4; i++) {
+        if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
+          if (!first) {
+            int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
+#pragma unroll
+            for (int j = 0; j < 2 * 4; j++) {
+              reinterpret_cast<float *>(
+                  &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] +=
+                  __half2float(reinterpret_cast<__half *>(&c_red)[j]);
+            }
+          }
+          if (!last) {
+            int4 c;
+#pragma unroll
+            for (int j = 0; j < 2 * 4; j++) {
+              reinterpret_cast<__half *>(&c)[j] =
+                  __float2half(reinterpret_cast<float *>(
+                      &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]);
+            }
+            C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] =
+                c;
+          }
+        }
+      }
+    }
+  };
+
+  // Write out the reduced final result in the correct layout. We only actually
+  // reshuffle matrix fragments in this step, the reduction above is performed
+  // in fragment layout.
+  auto write_result = [&]() {
+    int c_gl_stride = prob_n / 8;
+    constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
+    int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
+    constexpr int c_sh_rd_delta =
+        c_sh_stride * (threads / (2 * thread_n_blocks));
+
+    int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) +
+                  (threadIdx.x % (2 * thread_n_blocks));
+    c_gl_wr += (2 * thread_n_blocks) * slice_col;
+    int c_sh_wr =
+        (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4;
+    c_sh_wr += 32 * (threadIdx.x / 32);
+    int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) +
+                  (threadIdx.x % (2 * thread_n_blocks));
+
+    int c_gl_wr_end = c_gl_stride * prob_m;
+
+    // We first reorder in shared memory to guarantee the most efficient final
+    // global write patterns.
+    auto write = [&](int idx, float c0, float c1, FragS &s) {
+      half2 res = __halves2half2(__float2half(c0), __float2half(c1));
+      if (group_blocks ==
+          -1) // for per-column quantization we finally apply the scale here
+        res = __hmul2(res, s[0]);
+      ((half2 *)sh)[idx] = res;
+    };
+    if (threadIdx.x / 32 < thread_n_blocks / 4) {
+#pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+#pragma unroll
+        for (int j = 0; j < 4; j++) {
+          int wr = c_sh_wr + 8 * j;
+          write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
+                frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
+          write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
+                frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
+          write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
+                frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
+          write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
+                frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
+        }
+        c_sh_wr += 16 * (4 * c_sh_stride);
+      }
+    }
+    __syncthreads();
+
+#pragma unroll
+    for (int i = 0;
+         i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
+         i++) {
+      if (c_gl_wr < c_gl_wr_end) {
+        C[c_gl_wr] = sh[c_sh_rd];
+        c_gl_wr += c_gl_wr_delta;
+        c_sh_rd += c_sh_rd_delta;
+      }
+    }
+  };
+
+  // Start global fetch and register load pipelines.
+  auto start_pipes = [&]() {
+#pragma unroll
+    for (int i = 0; i < stages - 1; i++)
+      fetch_to_shared(i, i, i < slice_iters);
+    zero_accums();
+    wait_for_stage();
+    fetch_to_registers(0, 0);
+    a_gl_rd += a_gl_rd_delta_o * (stages - 1);
+  };
+  start_pipes();
+
+  // Main loop.
+  while (slice_iters) {
+// We unroll over both the global fetch and the register load pipeline to ensure
+// all shared memory accesses are static. Note that both pipelines have even
+// length meaning that the next iteration will always start at index 0.
+#pragma unroll
+    for (int pipe = 0; pipe < stages;) {
+#pragma unroll
+      for (int k = 0; k < b_sh_wr_iters; k++) {
+        fetch_to_registers(k + 1, pipe % stages);
+        if (k == b_sh_wr_iters - 2) {
+          fetch_to_shared((pipe + stages - 1) % stages, pipe,
+                          slice_iters >= stages);
+          pipe++;
+          wait_for_stage();
+        }
+        matmul(k);
+      }
+      slice_iters--;
+      if (slice_iters == 0)
+        break;
+    }
+    a_gl_rd += a_gl_rd_delta_o * stages;
+
+    // Process results and, if necessary, proceed to the next column slice.
+    // While this pattern may not be the most readable, other ways of writing
+    // the loop seemed to lead to noticeably worse performance after
+    // compilation.
+    if (slice_iters == 0) {
+      cp_async_wait<0>();
+      bool last = slice_idx == slice_count - 1;
+      // For per-column scales, we only fetch them here in the final step before
+      // write-out.
+      if (group_blocks == -1 && last) {
+        if (s_sh_wr_pred)
+          cp_async4_stream(&sh_s[s_sh_wr], &s[s_gl_rd]);
+        cp_async_fence();
+      }
+      thread_block_reduce();
+      if (group_blocks == -1 && last) {
+        cp_async_wait<0>();
+        __syncthreads();
+        if (threadIdx.x / 32 < thread_n_blocks / 4) {
+          reinterpret_cast<int4 *>(&frag_s)[0] = sh_s[s_sh_rd + 0];
+          reinterpret_cast<int4 *>(&frag_s)[1] = sh_s[s_sh_rd + 4];
+        }
+      }
+      if (slice_count > 1) { // only globally reduce if there is more than one
+                             // block in a slice
+        barrier_acquire(&locks[slice_col], slice_idx);
+        global_reduce(slice_idx == 0, last);
+        barrier_release(&locks[slice_col], last);
+      }
+      if (last) // only the last block in a slice actually writes the result
+        write_result();
+      slice_row = 0;
+      slice_col_par++;
+      slice_col++;
+      init_slice();
+      if (slice_iters) {
+        a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                  (threadIdx.x % a_gl_rd_delta_o);
+#pragma unroll
+        for (int i = 0; i < b_sh_wr_iters; i++)
+          B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles;
+        if (slice_col == 0) {
+#pragma unroll
+          for (int i = 0; i < b_sh_wr_iters; i++)
+            B_ptr[i] -= b_gl_stride;
+        }
+        s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+        start_pipes();
+      }
+    }
+  }
+}
+
+#else
+
+template <const int threads,         // number of threads in a threadblock
+          const int thread_m_blocks, // number of 16x16 blocks in the m
+                                     // dimension (batchsize) of the threadblock
+          const int thread_n_blocks, // same for n dimension (output)
+          const int thread_k_blocks, // same for k dimension (reduction)
+          const int stages, // number of stages for the async global->shared
+                            // fetch pipeline
+          const int group_blocks = -1 // number of consecutive 16x16 blocks with
+                                      // a separate quantization scale
+          >
+__global__ void
+Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
+       const int4 *__restrict__ B, // 4bit quantized weight matrix of shape kxn
+       int4 *__restrict__ C,       // fp16 output buffer of shape mxn
+       const int4
+           *__restrict__ s, // fp16 quantization scales of shape (k/groupsize)xn
+       int prob_m,  // batch dimension m
+       int prob_n,  // output dimension n
+       int prob_k,  // reduction dimension k
+       int *locks   // extra global storage for barrier synchronization
+) {
+  // Marlin is not implemented yet for SM < 8.0
+  assert(false);
+  return;
+}
+
+#endif
+
+// 8 warps are a good choice since every SM has 4 schedulers and having more
+// than 1 warp per scheduler allows some more latency hiding. At the same time,
+// we want relatively few warps to have many registers per warp and small tiles.
+const int USER_THREADS =
+    256;              // Note: This is only used with user-provided thread_k/n
+const int STAGES = 4; // 4 pipeline stages fit into shared memory
+const int SHARED_MEM =
+    96 * 1024; // max shared memory on compute capability 8.6 (< 8.0)
+
+static constexpr int min_thread_n = 64;
+static constexpr int min_thread_k = 64;
+
+static constexpr int tile_size = 16;
+static constexpr int max_par = 16;
+
+static constexpr int pack_factor_4bit =
+    8; // We have 8 4-bit vals inside a 32 bit
+
+#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,           \
+                  GROUP_BLOCKS, NUM_THREADS)                                   \
+  else if (thread_m_blocks == THREAD_M_BLOCKS &&                               \
+           thread_n_blocks == THREAD_N_BLOCKS &&                               \
+           thread_k_blocks == THREAD_K_BLOCKS &&                               \
+           group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) {       \
+    cudaFuncSetAttribute(Marlin<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, \
+                                THREAD_K_BLOCKS, STAGES, GROUP_BLOCKS>,        \
+                         cudaFuncAttributeMaxDynamicSharedMemorySize,          \
+                         SHARED_MEM);                                          \
+    Marlin<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,     \
+           STAGES, GROUP_BLOCKS>                                               \
+        <<<blocks, NUM_THREADS, SHARED_MEM, stream>>>(                         \
+            A_ptr, B_ptr, C_ptr, s_ptr, prob_m, prob_n, prob_k, locks);        \
+  }
+
+typedef struct {
+  int thread_k;
+  int thread_n;
+  int num_threads;
+} thread_config_t;
+
+thread_config_t small_batch_thread_configs[] = {
+    // Ordered by priority
+
+    // thread_k, thread_n, num_threads
+    {128, 128, 256}, // Default
+    {128, 64, 128},  // Reduce N 2X, same K
+    {64, 256, 256},  // Reduce K 2X, increase N 2X
+    {64, 128, 128},  // Reduce K 2X, same N
+};
+
+thread_config_t large_batch_thread_configs[] = {
+    // Ordered by priority
+
+    // thread_k, thread_n, num_threads
+    {64, 256, 256},  // Default
+    {128, 128, 256}, // Reduce N 2X, increase K 2X
+    {64, 128, 128},  // Reduce N 2X, same K
+    {128, 64, 128},  // Reduce N 4X, increase K 2X
+};
+
+bool is_valid_config(thread_config_t const &th_config, int prob_m, int prob_n,
+                     int prob_k) {
+  // Sanity
+  if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
+      th_config.num_threads == -1) {
+    return false;
+  }
+
+  // Verify K/N are divisible by thread K/N
+  if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
+    return false;
+  }
+
+  // thread_k can be only 128 or 64 (because it must be less than groupsize
+  // which is 128)
+  if (th_config.thread_k != 128 && th_config.thread_k != 64) {
+    return false;
+  }
+
+  // Verify min for thread K/N
+  if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) {
+    return false;
+  }
+
+  // num_threads must be at least 128 (= 4 warps)
+  if (th_config.num_threads < 128) {
+    return false;
+  }
+
+  return true;
+}
+
+thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) {
+
+  if (prob_m <= 16) {
+    for (auto th_config : small_batch_thread_configs) {
+      if (is_valid_config(th_config, prob_m, prob_n, prob_k)) {
+        return th_config;
+      }
+    }
+
+  } else {
+    for (auto th_config : large_batch_thread_configs) {
+      if (is_valid_config(th_config, prob_m, prob_n, prob_k)) {
+        return th_config;
+      }
+    }
+  }
+
+  return thread_config_t{-1, -1, -1};
+}
+
+#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS)                               \
+  __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS)                            \
+  __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS)                             \
+  __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS)                            \
+  __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS)                             \
+  __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS)                            \
+  __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS)                             \
+  __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS)                            \
+  __CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS)                             \
+  __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS)                            \
+  __CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS)
+
+void marlin_cuda(const void *A, const void *B, void *C, void
*s, int prob_m, + int prob_n, int prob_k, void *workspace, int groupsize = -1, + int dev = 0, cudaStream_t stream = 0, int thread_k = -1, + int thread_n = -1, int sms = -1, int max_par = 16) { + int tot_m = prob_m; + int tot_m_blocks = ceildiv(tot_m, 16); + int pad = 16 * tot_m_blocks - tot_m; + + if (sms == -1) + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); + + // Set thread config + thread_config_t th_config; + if (thread_k != -1 && thread_n != -1) { + // User-defined config + th_config = thread_config_t{thread_k, thread_n, USER_THREADS}; + } else { + // Auto config + th_config = determine_thread_config(prob_m, prob_n, prob_k); + } + + if (!is_valid_config(th_config, prob_m, prob_n, prob_k)) { + throw std::runtime_error( + "Invalid thread config: thread_k = " + str(th_config.thread_k) + + ", thread_n = " + str(th_config.thread_n) + + ", num_threads = " + str(th_config.num_threads) + " for MKN = [" + + str(prob_m) + ", " + str(prob_k) + ", " + str(prob_n) + "]"); + } + + // Uncomment for debug + // std::cout << "Using thread_config: thread_k = " + str(th_config.thread_k) + + // ", thread_n = " + str(th_config.thread_n) + + // ", num_threads = " + str(th_config.num_threads) + " for + // MKN = [" + str(prob_m) + + // ", " + str(prob_k) + ", " + str(prob_n) + "]\n"; + + int num_threads = th_config.num_threads; + thread_k = th_config.thread_k; + thread_n = th_config.thread_n; + + int thread_k_blocks = thread_k / 16; + int thread_n_blocks = thread_n / 16; + int group_blocks = (groupsize == -1) ? -1 : groupsize / 16; + int blocks = sms; + + if (prob_m == 0 || prob_n == 0 || prob_k == 0) { + return; + } + + TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, + " is not divisible by thread_n = ", thread_n); + TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, + " is not divisible by thread_k = ", thread_k); + if (group_blocks != -1) { + TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, + " is not divisible by group_blocks = ", group_blocks); + } + + const int4 *A_ptr = (const int4 *)A; + const int4 *B_ptr = (const int4 *)B; + int4 *C_ptr = (int4 *)C; + const int4 *s_ptr = (const int4 *)s; + + int *locks = (int *)workspace; + + for (int i = 0; i < tot_m_blocks; i += 4) { + int thread_m_blocks = tot_m_blocks - i; + prob_m = tot_m - 16 * i; + int par = 1; + if (thread_m_blocks > 4) { + // Note that parallel > 1 currently only works for inputs without any + // padding + par = (16 * thread_m_blocks - pad) / 64; + if (par > max_par) + par = max_par; + prob_m = 64 * par; + i += 4 * (par - 1); + thread_m_blocks = 4; + } + + // For compilation speed, we only define the kernel configurations that have + // seemed useful (in terms of performance) in our testing, however many more + // are, in principle, possible. 
+ if (false) { + } + CALL_IF(8, 8, 256) + CALL_IF(16, 4, 256) + CALL_IF(8, 4, 128) + CALL_IF(4, 8, 128) + else { + throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) + + ", " + str(prob_k) + ", " + str(prob_n) + "]" + + ", groupsize = " + str(groupsize) + + ", thread_m_blocks = " + str(thread_m_blocks) + + ", thread_n_blocks = " + str(thread_n_blocks) + + ", thread_k_blocks = " + str(thread_k_blocks)); + } + + A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par; + C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par; + } +} + +} // namespace marlin + +torch::Tensor marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, + torch::Tensor &b_scales, torch::Tensor &workspace, + int64_t size_m, int64_t size_n, int64_t size_k) { + + // Verify M + TORCH_CHECK(size_m == a.size(0), + "Shape mismatch: a.size(0) = " + str(a.size(0)) + + ", size_m = " + str(size_m)); + + // Verify K + TORCH_CHECK(size_k == a.size(1), + "Shape mismatch: a.size(1) = " + str(a.size(1)) + + ", size_k = " + str(size_k)); + TORCH_CHECK(size_k % marlin::tile_size == 0, + "size_k = " + str(size_k) + + " is not divisible by tile_size = " + str(marlin::tile_size)); + TORCH_CHECK((size_k / marlin::tile_size) == b_q_weight.size(0), + "Shape mismatch: b_q_weight.size(0) = " + + str(b_q_weight.size(0)) + ", size_k = " + str(size_k) + + ", tile_size = " + str(marlin::tile_size)); + + // Verify N + TORCH_CHECK(b_scales.size(1) == size_n, + "b_scales.size(1) = " + str(b_scales.size(1)) + + ", size_n = " + str(size_n)); + TORCH_CHECK(b_q_weight.size(1) % marlin::tile_size == 0, + "b_q_weight.size(1) = " + str(b_q_weight.size(1)) + + " is not divisible by tile_size = " + str(marlin::tile_size)); + + int actual_size_n = + (b_q_weight.size(1) / marlin::tile_size) * marlin::pack_factor_4bit; + TORCH_CHECK(size_n == actual_size_n, + "size_n = " + str(size_n) + + ", actual_size_n = " + str(actual_size_n)); + + // Verify A device and strides + TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); + TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); + + // Verify B device and strides + TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); + TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); + + // Verify scales device and strides + TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); + TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); + + // Alloc C matrix + const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); + auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); + torch::Tensor c = torch::empty({size_m, size_n}, options); + + // thread_k: `k` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_k = -1; + // thread_n: `n` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_n = -1; + // sms: number of SMs to use for the kernel (can usually be left as auto -1) + int sms = -1; + + // Detect groupsize + if (b_scales.size(0) != 1) { + TORCH_CHECK(size_k % b_scales.size(0) == 0, + "size_k = " + str(size_k) + + ", is not divisible by b_scales.size(0) = " + + str(b_scales.size(0))); + } + int groupsize = b_scales.size(0) == 1 ? 
-1 : size_k / b_scales.size(0); + + // Verify groupsize + TORCH_CHECK(groupsize == -1 || groupsize == 128, + "Unexpected groupsize = " + str(groupsize)); + + // Verify workspace size + TORCH_CHECK( + size_n % marlin::min_thread_n == 0, + "size_n = " + str(size_n) + + ", is not divisible by min_thread_n = " + str(marlin::min_thread_n)); + int min_workspace_size = (size_n / marlin::min_thread_n) * marlin::max_par; + TORCH_CHECK(workspace.numel() >= min_workspace_size, + "workspace.numel = " + str(workspace.numel()) + + " is below min_workspace_size = " + str(min_workspace_size)); + + int dev = a.get_device(); + marlin::marlin_cuda(a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), + b_scales.data_ptr(), size_m, size_n, size_k, + workspace.data_ptr(), groupsize, dev, + at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, + sms, marlin::max_par); + + return c; +} diff --git a/examples/offline_inference_sparse.py b/examples/offline_inference_sparse.py index b35ac0080e5ef..38ff25a3f4e47 100644 --- a/examples/offline_inference_sparse.py +++ b/examples/offline_inference_sparse.py @@ -1,6 +1,7 @@ from vllm import LLM, SamplingParams -model = LLM("nm-testing/TinyLlama-1.1B-Chat-v1.0-pruned2.4", sparsity="sparse_w16a16") +model = LLM("nm-testing/TinyLlama-1.1B-Chat-v1.0-pruned2.4", + sparsity="sparse_w16a16") sampling_params = SamplingParams(max_tokens=100, temperature=0) outputs = model.generate("Hello my name is", sampling_params=sampling_params) diff --git a/requirements-dev.txt b/requirements-dev.txt index e188278f0acc9..27811c61c315a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -13,6 +13,7 @@ types-setuptools pytest pytest-forked pytest-asyncio +pytest-rerunfailures httpx einops # required for MPT flash_attn # required for HuggingFace's llama implementation diff --git a/setup.py b/setup.py index 8fcb86394f76d..a0ce921433016 100644 --- a/setup.py +++ b/setup.py @@ -342,6 +342,8 @@ def get_torch_arch_list() -> Set[str]: if _is_cuda(): vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") + vllm_extension_sources.append( + "csrc/quantization/marlin/marlin_cuda_kernel.cu") vllm_extension_sources.append("csrc/custom_all_reduce.cu") # Add MoE kernels. 
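[Editor's note] As a rough orientation for reviewers, the shape, groupsize, and workspace requirements enforced by the TORCH_CHECKs in `marlin_gemm` above can be read as the following sketch. Sizes are hypothetical, the packed weights are dummy zeros, and it assumes the new C++ entry point is exposed as `vllm._C.ops.marlin_gemm`, which is how `MarlinLinearMethod` later in this patch calls it; it also assumes a GPU of compute capability >= 8.0, matching `MarlinConfig.get_min_capability()` below.

    import torch
    from vllm._C import ops

    size_m, size_k, size_n = 16, 4096, 4096  # batch, reduction dim, output dim
    groupsize = 128

    a = torch.randn((size_m, size_k), dtype=torch.float16, device="cuda")
    # Packed 4-bit weights: (size_k / tile_size) x (size_n * tile_size / pack_factor_4bit) int32.
    b_q_weight = torch.zeros((size_k // 16, size_n * 16 // 8),
                             dtype=torch.int32, device="cuda")
    # One scale row per group of 128 input channels (a single row for groupsize == -1).
    b_scales = torch.ones((size_k // groupsize, size_n),
                          dtype=torch.float16, device="cuda")
    # Workspace for the lock-based global reduction:
    # at least (size_n / min_thread_n) * max_par = (size_n / 64) * 16 ints, zero-initialized.
    workspace = torch.zeros(size_n // 64 * 16, dtype=torch.int, device="cuda")

    c = ops.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m, size_n, size_k)
    assert c.shape == (size_m, size_n) and c.dtype == torch.float16

In normal serving this entry point is only reached through `MarlinLinearMethod.apply_weights`, added further below, with real weights loaded from a Marlin-serialized checkpoint.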
diff --git a/tests/conftest.py b/tests/conftest.py index 6af9b36b6febe..85cfed52920d6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -197,6 +197,24 @@ def generate( outputs.append((req_sample_output_ids, req_sample_output_strs)) return outputs + def generate_w_logprobs( + self, + prompts: List[str], + sampling_params: SamplingParams, + ) -> List[Tuple[List[int], str]]: + assert sampling_params.logprobs is not None + + req_outputs = self.model.generate(prompts, + sampling_params=sampling_params) + outputs = [] + for req_output in req_outputs: + for sample in req_output.outputs: + output_str = sample.text + output_ids = sample.token_ids + output_logprobs = sample.logprobs + outputs.append((output_ids, output_str, output_logprobs)) + return outputs + def generate_greedy( self, prompts: List[str], @@ -207,6 +225,20 @@ def generate_greedy( return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs] + def generate_greedy_logprobs( + self, + prompts: List[str], + max_tokens: int, + num_logprobs: int, + ) -> List[Tuple[List[int], str]]: + greedy_logprobs_params = SamplingParams(temperature=0.0, + max_tokens=max_tokens, + logprobs=num_logprobs) + outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params) + + return [(output_ids, output_str, output_logprobs) + for output_ids, output_str, output_logprobs in outputs] + def generate_beam_search( self, prompts: List[str], diff --git a/tests/models/compare_utils.py b/tests/models/compare_utils.py new file mode 100644 index 0000000000000..e8d888026c68a --- /dev/null +++ b/tests/models/compare_utils.py @@ -0,0 +1,29 @@ +"""Compare the logprobs of two sequences generated by different models, which should +be similar but not necessarily equal. +""" + + +def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1): + # Loop through resonses to each prompt. + for prompt_idx, (outputs_0, + outputs_1) in enumerate(zip(outputs_0_lst, + outputs_1_lst)): + output_ids_0, output_str_0, logprobs_0 = outputs_0 + output_ids_1, output_str_1, logprobs_1 = outputs_1 + + # Loop through generated tokens. + for idx, (output_id_0, + output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): + + # If generated tokens don't match ... + if output_id_0 != output_id_1: + # ... each predicted token must be in top N logprobs of the other's + assert output_id_0 in logprobs_1[idx], ( + f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" + ) + assert output_id_1 in logprobs_0[idx], ( + f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" + ) + + # Break out since sequences will now diverge. + break diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py new file mode 100644 index 0000000000000..409a172df8516 --- /dev/null +++ b/tests/models/test_marlin.py @@ -0,0 +1,84 @@ +"""Compare the outputs of a GPTQ model to a Marlin model. + +Note: GPTQ and Marlin do not have bitwise correctness. +As a result, in this test, we just confirm that the top selected tokens of the +Marlin/GPTQ models are in the top 3 selections of eachother. + +Note: Marlin internally uses locks to synchronize the threads. This can +result in very slight nondeterminism for Marlin. As a result, we re-run the test +up to 3 times to see if we pass. + +Run `pytest tests/models/test_marlin.py --forked`. 
+""" + +import pytest +import torch +from compare_utils import check_logprobs_close +from dataclasses import dataclass +from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY + +capability = torch.cuda.get_device_capability() +capability = capability[0] * 10 + capability[1] +marlin_not_supported = ( + capability < _QUANTIZATION_CONFIG_REGISTRY["marlin"].get_min_capability()) + + +@dataclass +class ModelPair: + model_marlin: str + model_gptq: str + + +model_pairs = [ + ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128", + model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"), + ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin", + model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"), + ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", + model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq") +] + + +@pytest.mark.flaky(reruns=2) +@pytest.mark.skipif(marlin_not_supported, + reason="Marlin is not supported on this GPU type.") +@pytest.mark.parametrize("model_pair", model_pairs) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [3]) +def test_models( + vllm_runner, + example_prompts, + model_pair: ModelPair, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: + marlin_model = vllm_runner(model_pair.model_marlin, dtype=dtype) + marlin_outputs = marlin_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + # Note: not sure why, but deleting just the model on Ada Lovelace + # does not free the GPU memory. On Ampere, deleting just the model + # frees the memory. + del marlin_model.model.llm_engine.driver_worker + del marlin_model + + gptq_model = vllm_runner(model_pair.model_gptq, dtype=dtype) + gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts, + max_tokens, + num_logprobs) + + # Note: not sure why, but deleting just the model on Ada Lovelace + # does not free the GPU memory. On Ampere, deleting just the model + # frees the memory. + del gptq_model.model.llm_engine.driver_worker + del gptq_model + + # loop through the prompts + check_logprobs_close( + outputs_0_lst=gptq_outputs, + outputs_1_lst=marlin_outputs, + name_0="gptq", + name_1="marlin", + ) diff --git a/vllm/config.py b/vllm/config.py index b10817c93d69f..acaff0e559d88 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -160,7 +160,7 @@ def _verify_tokenizer_mode(self) -> None: def _verify_sparsity(self) -> None: supported_sparsity = ["sparse_w16a16", "semi_structured_sparse_w16a16"] - if self.quantization is not None: + if self.quantization is not None and self.sparsity is not None: raise ValueError("Both sparsity and quantization detected. Only " "one or the other is supported at a time.") @@ -182,8 +182,8 @@ def _verify_sparsity(self) -> None: f"({self.sparsity}).") def _verify_quantization(self) -> None: - supported_quantization = ["awq", "gptq", "squeezellm"] - rocm_not_supported_quantization = ["awq"] + supported_quantization = ["awq", "gptq", "squeezellm", "marlin"] + rocm_not_supported_quantization = ["awq", "marlin"] if self.quantization is not None: self.quantization = self.quantization.lower() @@ -191,6 +191,12 @@ def _verify_quantization(self) -> None: hf_quant_config = getattr(self.hf_config, "quantization_config", None) if hf_quant_config is not None: hf_quant_method = str(hf_quant_config["quant_method"]).lower() + # If the GPTQ model is serialized in marlin format, use marlin. 
+ marlin_format_flag = "is_marlin_format" + if (hf_quant_method == "gptq" + and marlin_format_flag in hf_quant_config + and hf_quant_config[marlin_format_flag]): + hf_quant_method = "marlin" if self.quantization is None: self.quantization = hf_quant_method elif self.quantization != hf_quant_method: @@ -210,9 +216,11 @@ def _verify_quantization(self) -> None: raise ValueError( f"{self.quantization} quantization is currently not supported " f"in ROCm.") - logger.warning(f"{self.quantization} quantization is not fully " - "optimized yet. The speed can be slower than " - "non-quantized models.") + if self.quantization != "marlin": + logger.warning( + f"{self.quantization} quantization is not fully " + "optimized yet. The speed can be slower than " + "non-quantized models.") def _verify_cuda_graph(self) -> None: if self.max_context_len_to_capture is None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c4084deeb3bba..8a0bd5b4ff8be 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -297,14 +297,12 @@ def create_engine_configs( ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig, DeviceConfig, Optional[LoRAConfig]]: device_config = DeviceConfig(self.device) - model_config = ModelConfig(self.model, self.tokenizer, - self.tokenizer_mode, self.trust_remote_code, - self.download_dir, self.load_format, - self.dtype, self.seed, self.revision, - self.code_revision, self.tokenizer_revision, - self.max_model_len, self.quantization, - self.sparsity, self.enforce_eager, - self.max_context_len_to_capture) + model_config = ModelConfig( + self.model, self.tokenizer, self.tokenizer_mode, + self.trust_remote_code, self.download_dir, self.load_format, + self.dtype, self.seed, self.revision, self.code_revision, + self.tokenizer_revision, self.max_model_len, self.quantization, + self.sparsity, self.enforce_eager, self.max_context_len_to_capture) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 49e05922443d2..4da4818906db2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -18,6 +18,14 @@ logger = init_logger(__name__) +def adjust_marlin_shard(param, shard_size, shard_offset): + marlin_tile_size = getattr(param, "marlin_tile_size", None) + if marlin_tile_size is None: + return shard_size, shard_offset + + return shard_size * marlin_tile_size, shard_offset * marlin_tile_size + + class LinearMethodBase(ABC): """Base class for different (maybe quantized) linear methods.""" @@ -282,6 +290,11 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor + + # If marlin, we need to adjust the offset and size to account for the tiling. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + loaded_weight_shard = loaded_weight.narrow( output_dim, shard_offset, shard_size) self.weight_loader(param, loaded_weight_shard, shard_id) @@ -299,6 +312,11 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor + + # If marlin, we need to adjust the offset and size to account for the tiling. 
+ shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + param_data = param_data.narrow(output_dim, shard_offset, shard_size) start_idx = tp_rank * shard_size @@ -409,6 +427,11 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor + + # If marlin, we need to adjust the offset and size to account for the tiling. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + loaded_weight_shard = loaded_weight.narrow( output_dim, shard_offset, shard_size) self.weight_loader(param, loaded_weight_shard, shard_id) @@ -433,6 +456,11 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor + + # If marlin, we need to adjust the offset and size to account for the tiling. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + param_data = param_data.narrow(output_dim, shard_offset, shard_size) if loaded_shard_id == "q": diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index b3449eaff0e35..dc54641878c64 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -4,11 +4,13 @@ from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig +from vllm.model_executor.layers.quantization.marlin import MarlinConfig _QUANTIZATION_CONFIG_REGISTRY = { "awq": AWQConfig, "gptq": GPTQConfig, "squeezellm": SqueezeLLMConfig, + "marlin": MarlinConfig, } diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py new file mode 100644 index 0000000000000..96800759b4a25 --- /dev/null +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -0,0 +1,207 @@ +from typing import Any, Dict, List, Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm._C import ops +from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig + + +class MarlinConfig(QuantizationConfig): + """Config class for Marlin. + + Reference: https://github.com/IST-DASLab/marlin/tree/master + """ + + def __init__( + self, + group_size: int, + ) -> None: + # Group size for the quantization. + self.group_size = group_size + if self.group_size != 128 and self.group_size != -1: + raise ValueError( + "Currently, only group size 128 and -1 (channelwise) is supported for " + f"Marlin, but got group_size of {self.group_size}") + + # 4 Bits packed into 32 bit datatype. + self.pack_factor = 32 // 4 + + # Tile size used by marlin kernels. + self.tile_size = 16 + + # Min out_features dim + self.min_n_threads = 64 + + # Min in_features dim + self.min_k_threads = 128 + + # Max parallel problems to solve at once (improves large batch performance) + self.max_parallel = 16 + + # Permutation length used by the marlin kernels. 
+ self.perm_len = 1024 + + def __repr__(self) -> str: + return f"MarlinConfig(group_size={self.group_size}" + + @classmethod + def get_name(cls) -> str: + return "marlin" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.float16] + + @classmethod + def get_min_capability(cls) -> int: + return 80 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "MarlinConfig": + group_size = cls.get_from_keys(config, ["group_size"]) + return cls(group_size) + + def get_linear_method(self) -> "MarlinLinearMethod": + return MarlinLinearMethod(self) + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class MarlinLinearMethod(LinearMethodBase): + """Linear method for Marlin. + + Args: + quant_config: The Marlin quantization config. + """ + + def __init__(self, quant_config: MarlinConfig): + self.quant_config = quant_config + + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: + if params_dtype != torch.float16: + raise ValueError( + f"The params dtype must be float16, but got {params_dtype}") + + # Validate output_size_per_partition + if output_size_per_partition % self.quant_config.min_n_threads != 0: + raise ValueError( + f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by min_n_threads = {self.quant_config.min_n_threads}." + ) + if output_size_per_partition % self.quant_config.pack_factor != 0: + raise ValueError( + f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by pack_factor = {self.quant_config.pack_factor}." + ) + + # Validate input_size_per_partition + if input_size_per_partition % self.quant_config.min_k_threads != 0: + raise ValueError( + f"Weight input_size_per_partition = {input_size_per_partition} is not divisible by min_k_threads = {self.quant_config.min_k_threads}." + ) + if self.quant_config.group_size != -1 and input_size_per_partition % self.quant_config.group_size != 0: + raise ValueError( + f"Weight input_size_per_partition = f{input_size_per_partition} is not divisible by group_size = {self.quant_config.group_size}." + ) + + # Check that we have at least 4 tiles horizontally in the shard + num_tiles_per_perm = self.quant_config.perm_len // ( + self.quant_config.tile_size**2) + if output_size_per_partition % num_tiles_per_perm != 0: + raise ValueError( + "Each permutation group must reside on the same gpu") + + # Quantized 4Bit weights packed into Int32. 
+ qweight = Parameter( + torch.empty( + input_size_per_partition // self.quant_config.tile_size, + output_size_per_partition * self.quant_config.tile_size // + self.quant_config.pack_factor, + device="cuda", + dtype=torch.int32, + ), + requires_grad=False, + ) + set_weight_attrs( + qweight, + { + "input_dim": 0, + "output_dim": 1, + "packed_dim": 1, + "pack_factor": self.quant_config.pack_factor, + "marlin_tile_size": self.quant_config.tile_size, + }, + ) + + # Determine if channelwise or not + input_groups = 1 if self.quant_config.group_size == -1 else input_size_per_partition // self.quant_config.group_size + + scales = Parameter( + torch.empty( + input_groups, + output_size_per_partition, + device="cuda", + dtype=params_dtype, + ), + requires_grad=False, + ) + set_weight_attrs( + scales, + { + "input_dim": None if input_groups == 1 else 0, + "output_dim": 1, + }, + ) + + # Allocate workspace (Used for internal locking mechanism) + max_workspace_size = ( + output_size_per_partition // + self.quant_config.min_n_threads) * self.quant_config.max_parallel + workspace = Parameter(torch.zeros(max_workspace_size, + device="cuda", + dtype=torch.int), + requires_grad=False) + + return { + "B": qweight, + "s": scales, + "workspace": workspace, + } + + def apply_weights( + self, + weights: Dict[str, Any], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + qweight = weights["B"] + scales = weights["s"] + workspace = weights["workspace"] + + x_2d = x.view(-1, x.shape[-1]) + + size_m = x_2d.shape[0] + size_k = x_2d.shape[1] + size_n = scales.shape[1] + + output_2d = ops.marlin_gemm(x_2d, qweight, scales, workspace, size_m, + size_n, size_k) + + output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], )) + + if bias is not None: + output.add_(bias) # In-place add + + return output diff --git a/vllm/model_executor/layers/sparsity/sparse_w16a16.py b/vllm/model_executor/layers/sparsity/sparse_w16a16.py index 1eb59bc269c27..ab348b9823caa 100644 --- a/vllm/model_executor/layers/sparsity/sparse_w16a16.py +++ b/vllm/model_executor/layers/sparsity/sparse_w16a16.py @@ -6,8 +6,8 @@ from vllm.model_executor.layers.sparsity.base_config import SparsityConfig from .sparse_w16a16_linear_method import SparseW16A16LinearMethod -from magic_wand import (CompressedStorageFormat, SparseBitmaskStorageFormat, - SparseBEGemmStorageFormat) +from magic_wand import (CompressedStorageFormat, SparseBEGemmStorageFormat, + SparseBitmaskStorageFormat) logger = init_logger(__name__) From 9209f1522d2407820d3ec6a7ddcc78b7ccb076c6 Mon Sep 17 00:00:00 2001 From: Andy Linfoot <78757007+andy-neuma@users.noreply.github.com> Date: Thu, 22 Feb 2024 22:43:40 -0500 Subject: [PATCH 111/112] additional updates to "bump-to-v0.3.2" (#39) SUMMARY * update `TORCH_CUDA_ARCH_LIST` to match `magic_wand` * update "test vllm" action to run tests serially * add helper script to find *.py tests, run them serially, and output JUnit formatted xml TEST working through changes manually on debug instance --------- Co-authored-by: andy-neuma --- .github/actions/nm-build-vllm/action.yml | 2 - .github/actions/nm-set-env/action.yml | 13 +++-- .github/actions/nm-test-vllm/action.yml | 12 ++--- .github/pull_request_template.md | 6 +++ .github/scripts/run-tests | 66 ++++++++++++++++++++++++ .github/workflows/build-test.yml | 24 ++++++--- .github/workflows/remote-push.yml | 5 +- 7 files changed, 106 insertions(+), 22 deletions(-) create mode 100644 .github/pull_request_template.md create mode 100755 .github/scripts/run-tests diff --git 
a/.github/actions/nm-build-vllm/action.yml b/.github/actions/nm-build-vllm/action.yml index 780c2f99de3c6..5218078ba1704 100644 --- a/.github/actions/nm-build-vllm/action.yml +++ b/.github/actions/nm-build-vllm/action.yml @@ -19,8 +19,6 @@ runs: steps: - id: build run: | - # TODO: this is a hack ... fix it later - # pyenv hardcoded ... python version hardcoded ... COMMIT=${{ github.sha }} VENV="${{ inputs.venv }}-${COMMIT:0:7}" source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate diff --git a/.github/actions/nm-set-env/action.yml b/.github/actions/nm-set-env/action.yml index d5b108d97ba4a..863354f35dd0b 100644 --- a/.github/actions/nm-set-env/action.yml +++ b/.github/actions/nm-set-env/action.yml @@ -1,15 +1,20 @@ name: set neuralmagic env description: 'sets environment variables for neuralmagic' inputs: - hf_home: + hf_token: description: 'Hugging Face home' required: true + Gi_per_thread: + description: 'requested GiB to reserve per thread' + required: true runs: using: composite steps: - run: | - echo "HF_HOME=${HF_HOME_TOKEN}" >> $GITHUB_ENV - echo "TORCH_CUDA_ARCH_LIST=8.0+PTX" >> $GITHUB_ENV + echo "HF_TOKEN=${HF_TOKEN_SECRET}" >> $GITHUB_ENV + NUM_THREADS=$(./.github/scripts/determine-threading -G ${{ inputs.Gi_per_thread }}) + echo "MAX_JOBS=${NUM_THREADS}" >> $GITHUB_ENV + echo "VLLM_INSTALL_PUNICA_KERNELS=1" >> $GITHUB_ENV echo "PYENV_ROOT=/usr/local/apps/pyenv" >> $GITHUB_ENV echo "XDG_CONFIG_HOME=/usr/local/apps" >> $GITHUB_ENV WHOAMI=$(whoami) @@ -17,5 +22,5 @@ runs: echo "LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64::/usr/local/cuda-12.1/lib64:" >> $GITHUB_ENV echo "PROJECT_ID=12" >> $GITHUB_ENV env: - HF_HOME_TOKEN: ${{ inputs.hf_home }} + HF_TOKEN_SECRET: ${{ inputs.hf_token }} shell: bash diff --git a/.github/actions/nm-test-vllm/action.yml b/.github/actions/nm-test-vllm/action.yml index 27dae15df0332..7d05450e4e1c2 100644 --- a/.github/actions/nm-test-vllm/action.yml +++ b/.github/actions/nm-test-vllm/action.yml @@ -4,8 +4,8 @@ inputs: test_directory: description: 'test directory, path is relative to neuralmagic-vllm' required: true - test_xml: - description: 'filename for xml test results' + test_results: + description: 'top-level directory for xml test results' required: true python: description: 'python version, e.g. 3.10.12' @@ -22,15 +22,15 @@ runs: steps: - id: test run: | - SUCCESS=0 - # TODO: this is a hack ... fix it later - # pyenv hardcoded ... python version hardcoded ... COMMIT=${{ github.sha }} VENV="${{ inputs.venv }}-${COMMIT:0:7}" source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate pip3 install --index-url http://192.168.201.226:8080/ --trusted-host 192.168.201.226 magic-wand pip3 install -r requirements-dev.txt - pytest --junitxml=${{ inputs.test_xml }} ${{ inputs.test_directory }} || SUCCESS=$? + # run tests via runner script (serially) + SUCCESS=0 + ./.github/scripts/run-tests -t ${{ inputs.test_directory }} -r ${{ inputs.test_results }} || SUCCESS=$? + echo "was this a SUCCESS? 
${SUCCESS}" echo "status=${SUCCESS}" >> "$GITHUB_OUTPUT" exit ${SUCCESS} shell: bash diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000000000..e871931956390 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,6 @@ +SUMMARY: +"please provide a brief summary" + +TEST PLAN: +"please outline how the changes were tested" + diff --git a/.github/scripts/run-tests b/.github/scripts/run-tests new file mode 100755 index 0000000000000..2c5aeb1d9826e --- /dev/null +++ b/.github/scripts/run-tests @@ -0,0 +1,66 @@ +#!/bin/bash -e + +# simple helper script to manage concurrency while running tests + +usage() { + echo "Usage: ${0} " + echo + echo " -t - test directory, i.e. location of *.py test files. (default 'tests/')" + echo " -r - desired results base directory. xml results will mirror provided tests directory structure. (default 'test-results/')" + echo " -h - this list of options" + echo + echo "note: all paths are relative to 'neuralmagic-vllm' root" + echo + exit 1 +} + +TEST_DIR=tests +RESULTS_DIR=test-results + +while getopts "ht:r:" OPT; do + case "${OPT}" in + h) + usage + ;; + t) + TEST_DIR="${OPTARG}" + ;; + r) + RESULTS_DIR="${OPTARG}" + ;; + esac +done + +# check if variables are valid +if [ -z "${RESULTS_DIR}" ]; then + echo "please set desired results base directory" + usage +fi + +if [ -z "${TEST_DIR}" ]; then + echo "please set test directory" + usage +fi + +if [ ! -d "${TEST_DIR}" ]; then + echo "specified test directory, '${TEST_DIR}' does not exist ..." + usage +fi + +# run tests serially +TESTS_DOT_PY=$(find ${TEST_DIR} -not -name "__init__.py" -name "*.py") +TESTS_TO_RUN=($TESTS_DOT_PY) +SUCCESS=0 +for TEST in "${TESTS_TO_RUN[@]}" +do + LOCAL_SUCCESS=0 + RESULT_XML=$(echo ${TEST} | sed -e "s/${TEST_DIR}/${RESULTS_DIR}/" | sed -e "s/.py/.xml/") + pytest --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? + SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) +done + +if [ "${SUCCESS}" -eq "0" ]; then + exit 0 +else + exit 1 +fi diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 26a9b5cb89bcd..7d571b50adf14 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -15,6 +15,10 @@ on: description: "git commit hash or branch name" type: string required: true + Gi_per_thread: + description: 'requested GiB to reserve per thread' + type: string + required: true python: description: "python version, e.g. 3.10.12" type: string @@ -35,6 +39,10 @@ on: description: "git commit hash or branch name" type: string required: true + Gi_per_thread: + description: 'requested GiB to reserve per thread' + type: string + required: true python: description: "python version, e.g. 
3.10.12" type: string @@ -61,7 +69,8 @@ jobs: id: setenv uses: ./.github/actions/nm-set-env/ with: - hf_home: ${{ secrets.NM_HF_HOME }} + hf_token: ${{ secrets.NM_HF_TOKEN }} + Gi_per_thread: ${{ inputs.Gi_per_thread }} - name: set python id: set_python @@ -88,7 +97,7 @@ jobs: id: build uses: ./.github/actions/nm-build-vllm/ with: - Gi_per_thread: 1 + Gi_per_thread: ${{ inputs.Gi_per_thread }} python: ${{ inputs.python }} venv: TEST @@ -97,7 +106,7 @@ jobs: uses: ./.github/actions/nm-test-vllm/ with: test_directory: tests - test_xml: test-results/all_tests.xml + test_results: test-results python: ${{ inputs.python }} venv: TEST @@ -134,12 +143,13 @@ jobs: TEST_STATUS: ${{ steps.test.outputs.status }} run: | echo "checkout status: ${CHECKOUT}" - if [[ "${CHECKOUT}" != *"success"* ]]; then exit 1; fi - if [ ${LINT_STATUS} -ne 0 ]; then exit 1; fi - if [ ${BUILD_STATUS} -ne 0 ]; then exit 1; fi + echo "lint status: ${LINT_STATUS}" echo "build status: ${BUILD_STATUS}" - if [ ${TEST_STATUS} -ne 0 ]; then exit 1; fi echo "test status: ${TEST_STATUS}" + if [[ "${CHECKOUT}" != *"success"* ]]; then exit 1; fi + if [ -z "${LINT_STATUS}" ] || [ "${LINT_STATUS}" -ne "0" ]; then exit 1; fi + if [ -z "${BUILD_STATUS}" ] || [ "${BUILD_STATUS}" -ne "0" ]; then exit 1; fi + if [ -z "${TEST_STATUS}" ] || [ "${TEST_STATUS}" -ne "0" ]; then exit 1; fi - name: complete testmo run uses: ./.github/actions/nm-testmo-run-complete/ diff --git a/.github/workflows/remote-push.yml b/.github/workflows/remote-push.yml index c10b386ceb23e..800db24fde970 100644 --- a/.github/workflows/remote-push.yml +++ b/.github/workflows/remote-push.yml @@ -13,8 +13,6 @@ jobs: # TODO: expand python matrix later, once CI system has # matured. - # TODO: adjust timeout after we get a bit more experience. - # making it 60 is a bit permissive. # TODO: enable this later AWS-AVX2-32G-A10G-24G: @@ -24,7 +22,8 @@ jobs: uses: ./.github/workflows/build-test.yml with: label: aws-avx2-32G-a10g-24G - timeout: 60 + timeout: 180 gitref: '${{ github.ref }}' + Gi_per_thread: 4 python: ${{ matrix.python }} secrets: inherit From b1e14c221e53b467f1b0d34833ac3710fb61179d Mon Sep 17 00:00:00 2001 From: andy-neuma Date: Fri, 23 Feb 2024 08:34:36 -0500 Subject: [PATCH 112/112] move to 4 x gpu --- .github/workflows/remote-push.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/remote-push.yml b/.github/workflows/remote-push.yml index 800db24fde970..0563ed0d01c80 100644 --- a/.github/workflows/remote-push.yml +++ b/.github/workflows/remote-push.yml @@ -15,13 +15,13 @@ jobs: # matured. # TODO: enable this later - AWS-AVX2-32G-A10G-24G: + AWS-AVX2-192G-4-A10G-96G: strategy: matrix: python: [3.10.12] uses: ./.github/workflows/build-test.yml with: - label: aws-avx2-32G-a10g-24G + label: aws-avx2-192G-4-a10g-96G timeout: 180 gitref: '${{ github.ref }}' Gi_per_thread: 4
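[Editor's note] Taken together with the Marlin kernel, config, and layer changes earlier in this series, the intended end-to-end usage looks roughly like the sketch below. The model id is borrowed from `tests/models/test_marlin.py`; the sketch assumes an NVIDIA GPU with compute capability >= 8.0 and relies on the quantization auto-detection added in `vllm/config.py` (including the `is_marlin_format` flag), so no explicit quantization argument should be needed.

    from vllm import LLM, SamplingParams

    # Marlin-serialized checkpoint used by the new tests; any checkpoint whose
    # quantize_config.json identifies the Marlin format should be handled the same way.
    model = LLM("nm-testing/zephyr-beta-7b-marlin-g128", dtype="half")
    sampling_params = SamplingParams(max_tokens=100, temperature=0)
    outputs = model.generate("Hello my name is", sampling_params=sampling_params)
    print(outputs[0].outputs[0].text)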