Support BelleGroup/BELLE-LLAMA-7B-2M-gptq #12

Merged 8 commits on Apr 1, 2023
Create llama_gptq.py
ypw committed Apr 1, 2023
commit 6b28cb75aa1329c02f157ca31eadabfd8f1dd7a2
71 changes: 71 additions & 0 deletions predictors/llama_gptq.py
@@ -0,0 +1,71 @@
import time

import torch
from transformers import AutoTokenizer, LlamaForCausalLM
from transformers.utils.hub import cached_file

from gptq.llama_inference import load_quant
from predictors.llama import LLaMa


class LLaMaGPTQ(LLaMa):
    """LLaMa predictor backed by a GPTQ-quantized checkpoint."""

    def __init__(self, model_name, checkpoint_path='llama7b-2m-4bit-128g.pt', wbits=4, groupsize=128):
        print(f'Loading model {model_name}')
        start = time.perf_counter()
        self.model_name = model_name
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name, resume_download=True)
        # Resolve the quantized checkpoint file from the Hugging Face cache for model_name.
        checkpoint_path = cached_file(model_name, checkpoint_path)
        print(f'Loading model from {checkpoint_path} ...')
        model: LlamaForCausalLM = load_quant(model_name, checkpoint_path, wbits, groupsize)
        # Move the quantized weights to the target device (a no-op if load_quant already did so).
        model.to(self.device)
        model.eval()
        self.model = model
        end = time.perf_counter()
        print(f'Successfully loaded model {model_name}, time cost: {end - start:.2f}s')


def test():
    model_name = 'BelleGroup/BELLE-LLAMA-7B-2M-gptq'
    checkpoint_path = 'llama7b-2m-4bit-128g.pt'
    wbits = 4
    groupsize = 128

    predictor = LLaMaGPTQ(model_name, checkpoint_path, wbits, groupsize)
    device = predictor.device
    tokenizer = predictor.tokenizer
    model = predictor.model
    min_length = 10
    max_length = 2048
    top_p = 0.95
    temperature = 0.8

    print("Human:")
    line = input()
    # BELLE's prompt format: "Human: <query>\n\nAssistant:"
    inputs = 'Human: ' + line.strip() + '\n\nAssistant:'
    input_ids = tokenizer.encode(inputs, return_tensors="pt").to(device)

    # First path: sample directly from the quantized model.
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids,
            do_sample=True,
            min_length=min_length,
            max_length=max_length,
            top_p=top_p,
            temperature=temperature,
        )
    print("Assistant:\n【")
    print(tokenizer.decode(generated_ids[0]))
    print("】\n-------------------------------\n")

    # Second path: the streaming predict_continue API inherited from LLaMa.
    for x in predictor.predict_continue(
            line, '', max_length, top_p, temperature, [True], None):
        print("Assistant:\n【")
        print(x[0][-1][1])
        print("】\n-------------------------------\n")


if __name__ == '__main__':
test()
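
For reference, a minimal sketch of how the new predictor might be driven outside of test(), assuming the default checkpoint name and the same predict_continue calling convention used above (each yielded value is a history list whose last entry holds the latest assistant reply):

from predictors.llama_gptq import LLaMaGPTQ

predictor = LLaMaGPTQ('BelleGroup/BELLE-LLAMA-7B-2M-gptq')  # 4-bit, groupsize 128 by default
reply = ''
# Stream partial results; x[0][-1][1] is the reply generated so far (see test() above).
for x in predictor.predict_continue(
        'Hello', '', 2048, 0.95, 0.8, [True], None):
    reply = x[0][-1][1]
print(reply)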