
Commit da43e52

Authored Apr 1, 2023
Support BelleGroup/BELLE-LLAMA-7B-2M-gptq (#12)
* Update chatglm.py
* Update llama.py
* Update app.py
* gptq
* gptq
* Create llama_gptq.py
* Update app.py
* Update llama_gptq.py
1 parent c407047 commit da43e52

14 files changed: +2005 −10 lines
 

‎app.py

+5 −1
@@ -10,11 +10,15 @@
 # Load the model
 # model_name = 'THUDM/chatglm-6b'
 # model_name = 'BelleGroup/BELLE-LLAMA-7B-2M'
-model_name = 'silver/chatglm-6b-int4-slim'
+# model_name = 'silver/chatglm-6b-int4-slim'
+model_name = 'BelleGroup/BELLE-LLAMA-7B-2M-gptq'

 if 'chatglm' in model_name.lower():
     from predictors.chatglm import ChatGLM
     predictor = ChatGLM(model_name)
+elif 'gptq' in model_name.lower():
+    from predictors.llama_gptq import LLaMaGPTQ
+    predictor = LLaMaGPTQ(model_name)
 elif 'llama' in model_name.lower():
     from predictors.llama import LLaMa
     predictor = LLaMa(model_name)
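Note that the new `gptq` branch is checked before the generic `llama` branch, which matters because a name like `BelleGroup/BELLE-LLAMA-7B-2M-gptq` contains both substrings. A minimal standalone sketch of that dispatch order (the `pick_backend` helper and its return values are illustrative only, not the repository's actual predictor classes):

```python
# Illustrative sketch of the substring-based dispatch used in app.py.
# pick_backend() is a hypothetical helper, not part of the repository.
def pick_backend(model_name: str) -> str:
    name = model_name.lower()
    if 'chatglm' in name:
        return 'chatglm'
    elif 'gptq' in name:   # must precede 'llama': GPTQ LLaMa names also contain 'llama'
        return 'llama_gptq'
    elif 'llama' in name:
        return 'llama'
    raise ValueError(f'unsupported model: {model_name}')

assert pick_backend('BelleGroup/BELLE-LLAMA-7B-2M-gptq') == 'llama_gptq'
assert pick_backend('BelleGroup/BELLE-LLAMA-7B-2M') == 'llama'
```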

‎gptq/README.md

+70 (new file)
# GPTQ-for-Bloom & LLaMa

8-bit quantization of [Bloom](https://arxiv.org/pdf/2211.05100.pdf) using [GPTQ](https://arxiv.org/abs/2210.17323).

GPTQ is a SOTA one-shot weight quantization method.

**This code is based on [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa).**

## [Huggingface models](https://huggingface.co/BelleGroup/BELLE-7B-gptq)

| model name                 | file size | GPU memory usage |
| -------------------------- | --------- | ---------------- |
| base                       | 27G       | ~28.2G           |
| bloom7b-2m-8bit-128g.pt    | 9.7G      | ~11.4G           |
| bloom7b-2m-4bit-128g.pt    | 6.9G      | ~8.4G            |
| bloom7b-0.2m-8bit-128g.pt  | 9.7G      | ~11.4G           |
| bloom7b-0.2m-4bit-128g.pt  | 6.9G      | ~8.4G            |

All experiments were run on a single NVIDIA A100.

## Installation

If you don't have [conda](https://docs.conda.io/en/latest/miniconda.html), install it first.

```
conda create --name gptq python=3.9 -y
conda activate gptq
conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
# Or, if you're having trouble with conda, use pip with python3.9:
# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117

pip install -r requirements.txt
python setup_cuda.py install

# Benchmark performance for the FC2 layer of LLaMa-7B
CUDA_VISIBLE_DEVICES=0 python test_kernel.py
```

## Dependencies

* `torch`: tested on v2.0.0+cu117
* `transformers`: tested on v4.28.0.dev0
* `datasets`: tested on v2.10.1
* `safetensors`: tested on v0.3.0
* To run the 4-bit kernels: a setup for compiling PyTorch CUDA extensions is required (see https://pytorch.org/tutorials/advanced/cpp_extension.html); tested on CUDA 11.7.

## Model inference with the saved model

```
# BELLE-7B-gptq: local path of the model saved from Huggingface
git lfs install
git clone https://huggingface.co/BelleGroup/BELLE-7B-gptq

# model inference with the saved model
CUDA_VISIBLE_DEVICES=0 python bloom_inference.py BELLE-7B-gptq --wbits 8 --groupsize 128 --load BELLE-7B-gptq/bloom7b-2m-8bit-128g.pt --text "hello"
```

## Model quantization

```
# BELLE-7B-gptq: local path for the saved (compressed) model
# Save the compressed model
CUDA_VISIBLE_DEVICES=0 python bloom.py BelleGroup/BELLE-7B-2M wikitext2 --wbits 8 --groupsize 128 --save BELLE-7B-gptq/bloom7b-2m-8bit-128g.pt
```

The CUDA kernels support 2, 3, 4, and 8 bits.

In general, 8-bit quantization with a groupsize of 128 is recommended.

## Acknowledgements

This code is based on [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa).

Thanks to [Bloom](https://arxiv.org/pdf/2211.05100.pdf), a powerful LLM.
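For context on what the quantization code in the next file computes: GPTQ quantizes one weight matrix at a time, minimizing the layer-wise reconstruction error on a set of calibration inputs. A compact sketch of that standard objective (from the GPTQ paper, not specific to this repository), with X the matrix of calibration activations:

```
\hat{W} = \underset{\hat{W}}{\arg\min}\; \lVert W X - \hat{W} X \rVert_2^2,
\qquad
H = 2\, X X^{\top}
```

H is the Hessian of this quadratic objective. `GPTQ.add_batch` in `gptq/gptq.py` below accumulates a running estimate of it from the calibration batches, and `fasterquant` uses a Cholesky factorization of its inverse to propagate each column's quantization error onto the not-yet-quantized columns.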

‎gptq/gptq.py

+163 (new file)
import math
import time

import torch
import torch.nn as nn
import transformers

from gptq.quant import *


DEBUG = False

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False


class GPTQ:

    def __init__(self, layer):
        self.layer = layer
        self.dev = self.layer.weight.device
        W = layer.weight.data.clone()
        if isinstance(self.layer, nn.Conv2d):
            W = W.flatten(1)
        if isinstance(self.layer, transformers.Conv1D):
            W = W.t()
        self.rows = W.shape[0]
        self.columns = W.shape[1]
        self.H = torch.zeros((self.columns, self.columns), device=self.dev)
        self.nsamples = 0

    def add_batch(self, inp, out):
        if DEBUG:
            self.inp1 = inp
            self.out1 = out
        if len(inp.shape) == 2:
            inp = inp.unsqueeze(0)
        tmp = inp.shape[0]
        if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D):
            if len(inp.shape) == 3:
                inp = inp.reshape((-1, inp.shape[-1]))
            inp = inp.t()
        if isinstance(self.layer, nn.Conv2d):
            unfold = nn.Unfold(
                self.layer.kernel_size,
                dilation=self.layer.dilation,
                padding=self.layer.padding,
                stride=self.layer.stride
            )
            inp = unfold(inp)
            inp = inp.permute([1, 0, 2])
            inp = inp.flatten(1)
        self.H *= self.nsamples / (self.nsamples + tmp)
        self.nsamples += tmp
        # inp = inp.float()
        inp = math.sqrt(2 / self.nsamples) * inp.float()
        # self.H += 2 / self.nsamples * inp.matmul(inp.t())
        self.H += inp.matmul(inp.t())

    def fasterquant(
        self, blocksize=128, percdamp=.01, groupsize=-1
    ):
        W = self.layer.weight.data.clone()
        if isinstance(self.layer, nn.Conv2d):
            W = W.flatten(1)
        if isinstance(self.layer, transformers.Conv1D):
            W = W.t()
        W = W.float()

        tick = time.time()

        if not self.quantizer.ready():
            self.quantizer.find_params(W, weight=True)

        H = self.H
        del self.H
        dead = torch.diag(H) == 0
        H[dead, dead] = 1
        W[:, dead] = 0

        Losses = torch.zeros_like(W)
        Q = torch.zeros_like(W)

        damp = percdamp * torch.mean(torch.diag(H))
        diag = torch.arange(self.columns, device=self.dev)
        H[diag, diag] += damp
        H = torch.linalg.cholesky(H)
        H = torch.cholesky_inverse(H)
        H = torch.linalg.cholesky(H, upper=True)
        Hinv = H

        scale = []
        zero = []
        now_idx = 1

        for i1 in range(0, self.columns, blocksize):
            i2 = min(i1 + blocksize, self.columns)
            count = i2 - i1

            W1 = W[:, i1:i2].clone()
            Q1 = torch.zeros_like(W1)
            Err1 = torch.zeros_like(W1)
            Losses1 = torch.zeros_like(W1)
            Hinv1 = Hinv[i1:i2, i1:i2]

            for i in range(count):
                w = W1[:, i]
                d = Hinv1[i, i]

                if groupsize != -1:
                    if (i1 + i) % groupsize == 0:
                        self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True)

                    if ((i1 + i) // groupsize) - now_idx == -1:
                        scale.append(self.quantizer.scale)
                        zero.append(self.quantizer.zero)
                        now_idx += 1

                q = quantize(
                    w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq
                ).flatten()
                Q1[:, i] = q
                Losses1[:, i] = (w - q) ** 2 / d ** 2

                err1 = (w - q) / d
                W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
                Err1[:, i] = err1

            Q[:, i1:i2] = Q1
            Losses[:, i1:i2] = Losses1 / 2

            W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])

            if DEBUG:
                self.layer.weight.data[:, :i2] = Q[:, :i2]
                self.layer.weight.data[:, i2:] = W[:, i2:]
                print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
                print(torch.sum(Losses))

        torch.cuda.synchronize()
        print('time %.2f' % (time.time() - tick))
        print('error', torch.sum(Losses).item())

        if isinstance(self.layer, transformers.Conv1D):
            Q = Q.t()
        self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype)
        if DEBUG:
            print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))

        if scale == []:
            scale.append(self.quantizer.scale)
            zero.append(self.quantizer.zero)
        scale = torch.cat(scale, dim=1)
        zero = torch.cat(zero, dim=1)
        return scale, zero

    def free(self):
        if DEBUG:
            self.inp1 = None
            self.out1 = None
        self.H = None
        self.Losses = None
        self.Trace = None
        torch.cuda.empty_cache()
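A minimal sketch of how this class is typically driven in GPTQ-for-LLaMa-style quantization scripts. The `Quantizer` and `quantize` helpers are assumed to come from `gptq.quant` (the module star-imported above, as in the upstream project); the layer and calibration batches below are placeholders, and a CUDA device is required.

```python
import torch
import torch.nn as nn

from gptq.gptq import GPTQ
from gptq.quant import Quantizer  # assumed helper from the upstream GPTQ-for-LLaMa quant module

layer = nn.Linear(4096, 4096).cuda()                              # placeholder: one layer of the model
calib_batches = [torch.randn(8, 4096).cuda() for _ in range(4)]   # placeholder calibration data

gptq = GPTQ(layer)
gptq.quantizer = Quantizer()
gptq.quantizer.configure(8, perchannel=True, sym=False, mse=False)  # 8-bit weight quantization

# Collect second-order statistics (the Hessian H) from calibration activations
# by hooking the layer's forward pass.
def add_batch(module, inp, out):
    gptq.add_batch(inp[0].data, out.data)

handle = layer.register_forward_hook(add_batch)
for batch in calib_batches:
    layer(batch)
handle.remove()

# Quantize the layer's weights in place; returns per-group scales and zero points.
scale, zero = gptq.fasterquant(percdamp=0.01, groupsize=128)
gptq.free()
```

In the repository, the `bloom.py` command shown in the README drives this same flow over every layer of the model, with `--wbits` and `--groupsize` corresponding to the quantizer bit-width and the `groupsize` argument of `fasterquant` above.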
