@@ -3,11 +3,13 @@ RunPod | Transformer | Handler
 '''

 import argparse

+import accelerate
 import torch

 import runpod
 from runpod.serverless.utils.rp_validator import validate
 from transformers import (GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoXForCausalLM,
-                          GPTNeoXTokenizerFast, GPTJForCausalLM, AutoTokenizer, AutoModelForCausalLM,)
+                          GPTNeoXTokenizerFast, GPTJForCausalLM, AutoTokenizer, AutoModelForCausalLM,
+                          AutoConfig)
                           # LlamaForCausalLM, LlamaTokenizer)
@@ -99,6 +101,81 @@ INPUT_SCHEMA = {
     }
 }


+def load_quantized(model_name, wbits, groupsize):
+    """Adapted from https://github.com/oobabooga/text-generation-webui/blob/main/modules/GPTQ_loader.py"""
+    from pathlib import Path
+    import os, sys
+
+    # GPTQ-for-LLaMa must already be cloned into repositories/ (e.g. at image build time):
+    # os.system("mkdir repositories && git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa")
+    sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
+    from modelutils import find_layers
+    from quant import make_quant
+
+    import transformers
+    from transformers import AutoConfig
+
+    # Try to determine the model type from the model name
+    name = model_name.lower()
+    if any(k in name for k in ['llama', 'alpaca', 'vicuna']):
+        model_type = 'llama'
+    elif any(k in name for k in ['opt-', 'galactica']):
+        model_type = 'opt'
+    elif any(k in name for k in ['gpt-j', 'pygmalion-6b']):
+        model_type = 'gptj'
+    else:
+        print("Can't determine model type from model name, exiting...")
+        exit()
+
+    # Now we are going to try to locate the quantized model file.
+    # path_to_model = Path('/workdir/model')
+    path_to_model = next(Path('/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/"))
+    found_pts = list(path_to_model.glob("*.pt"))
+    found_safetensors = list(path_to_model.glob("*.safetensors"))
+    pt_path = None
+
+    if len(found_pts) == 1:
+        pt_path = found_pts[0]
+    elif len(found_safetensors) == 1:
+        pt_path = found_safetensors[0]
+
+    if not pt_path:
+        print("Could not find the quantized model in .pt or .safetensors format, exiting...")
+        exit()
+
+    # Build an empty fp16 model skeleton with weight initialization disabled, then
+    # swap its linear layers for quantized ones before loading the checkpoint.
+    config = AutoConfig.from_pretrained(str(path_to_model))
+
+    def noop(*args, **kwargs):
+        pass
+
+    torch.nn.init.kaiming_uniform_ = noop
+    torch.nn.init.uniform_ = noop
+    torch.nn.init.normal_ = noop
+
+    torch.set_default_dtype(torch.half)
+    transformers.modeling_utils._init_weights = False
+    torch.set_default_dtype(torch.half)
+    model = AutoModelForCausalLM.from_config(config)
+    torch.set_default_dtype(torch.float)
+    model = model.eval()
+    layers = find_layers(model)
+    for name in ['lm_head']:
+        if name in layers:
+            del layers[name]
+    make_quant(model, layers, wbits, groupsize)
+    del layers
+
+    print('Loading model ...')
+    if str(pt_path).endswith('.safetensors'):
+        from safetensors.torch import load_file as safe_load
+        model.load_state_dict(safe_load(str(pt_path)))
+    else:
+        model.load_state_dict(torch.load(str(pt_path)))
+    model.seqlen = 2048
+    print('Done.')
+
+    return model
+
+
 def generator(job):
     '''
     Run the job input to generate text output.
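Note that load_quantized() does not download anything itself: the glob over /root/.cache/huggingface/hub/ assumes the quantized checkpoint is already the first (ideally the only) snapshot in the Hugging Face cache. A minimal sketch of a build-time prefetch step that would satisfy that assumption — not part of this diff, and assuming huggingface_hub is installed and mayaeary/pygmalion-6b-4bit-128g is the intended checkpoint:

from huggingface_hub import snapshot_download

# Warm the HF cache so load_quantized()'s glob over
# /root/.cache/huggingface/hub/models--*/snapshots/*/ resolves to this checkpoint.
snapshot_download(repo_id="mayaeary/pygmalion-6b-4bit-128g")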
@@ -169,8 +246,9 @@ if __name__ == "__main__":
             "PygmalionAI/pygmalion-6b", local_files_only=True)

     elif args.model_name == 'pygmalion-6b-4bit-128g':
-        model = AutoModelForCausalLM.from_pretrained(
-            "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True).to(device)
+        # model = AutoModelForCausalLM.from_pretrained(
+        #     "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True).to(device)
+        model = load_quantized("pygmalion-6b-4bit-128g", 4, 128).to(device)
         tokenizer = AutoTokenizer.from_pretrained(
             "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True)
