'''
RunPod | Transformer | Handler
'''

import argparse

import accelerate
import torch
import runpod
from runpod.serverless.utils.rp_validator import validate
from transformers import (GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoXForCausalLM,
                          GPTNeoXTokenizerFast, GPTJForCausalLM, AutoTokenizer, AutoModelForCausalLM,
                          AutoConfig)
                          # LlamaForCausalLM, LlamaTokenizer)

from pathlib import Path
import os, sys

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

INPUT_SCHEMA = {
    'prompt': {
        'type': str,
        'required': True
    },
    'do_sample': {
        'type': bool,
        'required': False,
        'default': True,
        'description': '''
            Enables decoding strategies such as multinomial sampling,
            beam-search multinomial sampling, Top-K sampling and Top-p sampling.
            All these strategies select the next token from the probability distribution
            over the entire vocabulary with various strategy-specific adjustments.
        '''
    },
    'max_length': {
        'type': int,
        'required': False,
        'default': 100
    },
    'temperature': {
        'type': float,
        'required': False,
        'default': 0.9
    },
    'repetition_penalty': {
        'type': float,
        'required': False,
        'default': 1.1
    },
    'top_p': {
        'type': float,
        'required': False,
        'default': 0.5
    },
    'top_k': {
        'type': int,
        'required': False,
        'default': 40
    },
    'typical_p': {
        'type': float,
        'required': False,
        'default': 1.0
    },
    'encoder_repetition_penalty': {
        'type': float,
        'required': False,
        'default': 1.0
    },
    'min_length': {
        'type': int,
        'required': False,
        'default': 0
    },
    'num_beams': {
        'type': int,
        'required': False,
        'default': 1
    },
    'early_stopping': {
        'type': bool,
        'required': False,
        'default': False
    },
    'penalty_alpha': {
        'type': float,
        'required': False,
        'default': 0.0
    },
    'length_penalty': {
        'type': float,
        'required': False,
        'default': 1.0
    },
    'no_repeat_ngram_size': {
        'type': int,
        'required': False,
        'default': 0
    },
    'chat_generation_attempts': {
        'type': int,
        'required': False,
        'default': 1
    },
}

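# A minimal sketch of the job payload this schema validates (the prompt string
# is illustrative; every key other than 'prompt' may be omitted and falls back
# to the defaults declared above):
#
# {
#     "input": {
#         "prompt": "Hello, my name is",
#         "max_length": 100,
#         "temperature": 0.9
#     }
# }
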

def load_quantized(model_name, wbits, groupsize, device):
    """Adapted from https://github.com/oobabooga/text-generation-webui/blob/main/modules/GPTQ_loader.py"""
    # os.system("mkdir repositories && git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa")
    sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
    from modelutils import find_layers
    from quant import make_quant

    import transformers
    from transformers import AutoConfig

    # Try to determine the model type from the model name.
    name = model_name.lower()
    if any((k in name for k in ['llama', 'alpaca', 'vicuna'])):
        model_type = 'llama'
    elif any((k in name for k in ['opt-', 'galactica'])):
        model_type = 'opt'
    elif any((k in name for k in ['gpt-j', 'pygmalion-6b'])):
        model_type = 'gptj'
    else:
        print("Can't determine model type from model name, exiting...")
        exit()

    # Now we are going to try to locate the quantized model file.
    # path_to_model = Path('/workdir/model')
    path_to_model = next(Path('/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/"))
    found_pts = list(path_to_model.glob("*.pt"))
    found_safetensors = list(path_to_model.glob("*.safetensors"))
    pt_path = None

    if len(found_pts) == 1:
        pt_path = found_pts[0]
    elif len(found_safetensors) == 1:
        pt_path = found_safetensors[0]

    if not pt_path:
        print("Could not find the quantized model in .pt or .safetensors format, exiting...")
        exit()

    # Build an uninitialized model skeleton; the quantized checkpoint loaded
    # below supplies the actual weights, so weight initialization is patched out.
    config = AutoConfig.from_pretrained(str(path_to_model))

    def noop(*args, **kwargs):
        pass

    torch.nn.init.kaiming_uniform_ = noop
    torch.nn.init.uniform_ = noop
    torch.nn.init.normal_ = noop

    torch.set_default_dtype(torch.half)
    transformers.modeling_utils._init_weights = False
    torch.set_default_dtype(torch.half)
    model = AutoModelForCausalLM.from_config(config)
    torch.set_default_dtype(torch.float)
    model = model.eval()

    # Swap every linear layer except the LM head for its quantized counterpart.
    layers = find_layers(model)
    for name in ['lm_head']:
        if name in layers:
            del layers[name]
    make_quant(model, layers, wbits, groupsize)
    del layers

    print('Loading model ...')
    if str(pt_path).endswith('.safetensors'):
        from safetensors.torch import load_file as safe_load
        if device == -1:
            device = "cpu"
        model.load_state_dict(safe_load(str(pt_path)), strict=False)
    else:
        model.load_state_dict(torch.load(str(pt_path)), strict=False)
    model.seqlen = 2048
    print('Done.')

    return model

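# Hypothetical usage sketch (mirrors the calls in the __main__ block below);
# it assumes a GPTQ checkpoint quantized to 4 bits with group size 128 is
# already present in the local Hugging Face cache:
#
# model = load_quantized("pygmalion-6b-4bit-128g", wbits=4, groupsize=128, device=device)
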

def generator(job):
    '''
    Run the job input to generate text output.
    '''
    # Validate the input against INPUT_SCHEMA.
    val_input = validate(job['input'], INPUT_SCHEMA)
    if 'errors' in val_input:
        return {"error": val_input['errors']}
    val_input = val_input['validated_input']

    input_ids = tokenizer(val_input['prompt'], return_tensors="pt").input_ids.to(device)

    output = []
    for i in range(val_input['chat_generation_attempts']):
        gen_tokens = model.generate(
            input_ids,
            do_sample=val_input['do_sample'],
            temperature=val_input['temperature'],
            max_length=val_input['max_length'],
            repetition_penalty=val_input['repetition_penalty'],
            top_p=val_input['top_p'],
            top_k=val_input['top_k'],
            typical_p=val_input['typical_p'],
            encoder_repetition_penalty=val_input['encoder_repetition_penalty'],
            min_length=val_input['min_length'],
            num_beams=val_input['num_beams'],
            early_stopping=val_input['early_stopping'],
            penalty_alpha=val_input['penalty_alpha'],
            length_penalty=val_input['length_penalty'],
            no_repeat_ngram_size=val_input['no_repeat_ngram_size'],
        ).to(device)

        gen_text = tokenizer.batch_decode(gen_tokens)[0]
        # A single attempt returns the text directly; multiple attempts return a list.
        if val_input['chat_generation_attempts'] == 1:
            output = gen_text
        else:
            output.append(gen_text)

    return output

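# Local smoke test (hedged sketch, not part of the worker flow): once `model`
# and `tokenizer` have been loaded in the __main__ block below, the handler can
# be exercised directly with a hand-built job dict, e.g.:
#
# print(generator({"input": {"prompt": "Hello, my name is"}}))
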

# ---------------------------------------------------------------------------- #
#                                Parse Arguments                                #
# ---------------------------------------------------------------------------- #
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--model_name", type=str,
                    default="gpt-neo-1.3B", help="Name of the model to load.")

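# Example invocation (the script filename is assumed; the chosen model must
# already be in the local Hugging Face cache, since every from_pretrained call
# below passes local_files_only=True):
#
#     python handler.py --model_name gpt-neo-1.3B
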
if __name__ == "__main__":
    args = parser.parse_args()

    # --------------------------------- Neo 1.3B --------------------------------- #
    if args.model_name == 'gpt-neo-1.3B':
        model = GPTNeoForCausalLM.from_pretrained(
            "EleutherAI/gpt-neo-1.3B", local_files_only=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B", local_files_only=True)

    elif args.model_name == 'gpt-neo-2.7B':
        model = GPTNeoForCausalLM.from_pretrained(
            "EleutherAI/gpt-neo-2.7B", local_files_only=True, torch_dtype=torch.float16).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B", local_files_only=True)

    elif args.model_name == 'gpt-neox-20b':
        model = GPTNeoXForCausalLM.from_pretrained(
            "EleutherAI/gpt-neox-20b", local_files_only=True).half().to(device)
        tokenizer = GPTNeoXTokenizerFast.from_pretrained(
            "EleutherAI/gpt-neox-20b", local_files_only=True)

    elif args.model_name == 'pygmalion-6b':
        model = AutoModelForCausalLM.from_pretrained(
            "PygmalionAI/pygmalion-6b", local_files_only=True, low_cpu_mem_usage=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "PygmalionAI/pygmalion-6b", local_files_only=True)

    elif args.model_name == 'pygmalion-6b-4bit-128g':
        # model = AutoModelForCausalLM.from_pretrained(
        #     "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True).to(device)
        model = load_quantized("pygmalion-6b-4bit-128g", 4, 128, device).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True)

    elif args.model_name == 'pygmalion-6b-gptq-4bit':
        model = load_quantized("pygmalion-6b-gptq-4bit", 4, 128, device).to(device)
        # model = AutoModelForCausalLM.from_pretrained(
        #     "OccamRazor/pygmalion-6b-gptq-4bit", local_files_only=True, from_pt=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "OccamRazor/pygmalion-6b-gptq-4bit", local_files_only=True)

    elif args.model_name == 'gpt-j-6b':
        model = GPTJForCausalLM.from_pretrained(
            "EleutherAI/gpt-j-6B", local_files_only=True, revision="float16",
            torch_dtype=torch.float16).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "EleutherAI/gpt-j-6B", local_files_only=True)

    elif args.model_name == 'ppo-shygmalion-6b':
        model = AutoModelForCausalLM.from_pretrained(
            "TehVenom/PPO_Shygmalion-6b", local_files_only=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "TehVenom/PPO_Shygmalion-6b", local_files_only=True)

    elif args.model_name == 'dolly-shygmalion-6b':
        model = AutoModelForCausalLM.from_pretrained(
            "TehVenom/Dolly_Shygmalion-6b", local_files_only=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "TehVenom/Dolly_Shygmalion-6b", local_files_only=True)

    elif args.model_name == 'erebus-13b':
        model = AutoModelForCausalLM.from_pretrained(
            "KoboldAI/OPT-13B-Erebus", local_files_only=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "KoboldAI/OPT-13B-Erebus", local_files_only=True)

    elif args.model_name == 'gpt4-x-alpaca-13b-native-4bit-128g':
        pass
        # model = LlamaForCausalLM.from_pretrained(
        #     "anon8231489123/gpt4-x-alpaca-13b-native-4bit-128g", local_files_only=True).to(device)
        # tokenizer = LlamaTokenizer.from_pretrained(
        #     "anon8231489123/gpt4-x-alpaca-13b-native-4bit-128g", local_files_only=True)

    elif args.model_name == 'gpt4-x-alpaca':
        pass
        # model = LlamaForCausalLM.from_pretrained(
        #     "chavinlo/gpt4-x-alpaca", local_files_only=True).to(device)
        # tokenizer = LlamaTokenizer.from_pretrained(
        #     "chavinlo/gpt4-x-alpaca", local_files_only=True)

    elif args.model_name == 'rwkv-4-raven-7b':
        from RWKV import RWKVModel, RWKVTokenizer
        path_to_model = next(Path('/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/"))
        found_pths = list(path_to_model.glob("*.pth"))
        pt_path = None
        if len(found_pths) == 1:
            pt_path = found_pths[0]
        else:
            print("Could not find the model, exiting...")
            exit()
        model = RWKVModel.from_pretrained(Path(str(pt_path)), dtype="fp16", device="cuda")
        tokenizer = RWKVTokenizer.from_pretrained(Path(str(path_to_model)))

    runpod.serverless.start({"handler": generator})