''' RunPod | Transformer | Handler '''
import argparse
import os
import sys
from pathlib import Path

import accelerate
import torch
import runpod
from runpod.serverless.utils.rp_validator import validate
from transformers import (GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoXForCausalLM,
                          GPTNeoXTokenizerFast, GPTJForCausalLM, AutoTokenizer,
                          AutoModelForCausalLM, AutoConfig)
                          # LlamaForCausalLM, LlamaTokenizer)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

INPUT_SCHEMA = {
    'prompt': {
        'type': str,
        'required': True
    },
    'do_sample': {
        'type': bool,
        'required': False,
        'default': True,
        'description': '''
            Enables decoding strategies such as multinomial sampling,
            beam-search multinomial sampling, Top-K sampling and Top-p sampling.
            All these strategies select the next token from the probability
            distribution over the entire vocabulary with various
            strategy-specific adjustments.
        '''
    },
    'max_length': {
        'type': int,
        'required': False,
        'default': 100
    },
    'temperature': {
        'type': float,
        'required': False,
        'default': 0.9
    },
    'repetition_penalty': {
        'type': float,
        'required': False,
        'default': 1.1
    },
    'top_p': {
        'type': float,
        'required': False,
        'default': 0.5
    },
    'top_k': {
        'type': int,
        'required': False,
        'default': 40
    },
    'typical_p': {
        'type': float,
        'required': False,
        'default': 1.0
    },
    'encoder_repetition_penalty': {
        'type': float,
        'required': False,
        'default': 1.0
    },
    'min_length': {
        'type': int,
        'required': False,
        'default': 0
    },
    'num_beams': {
        'type': int,
        'required': False,
        'default': 1
    },
    'early_stopping': {
        'type': bool,
        'required': False,
        'default': False
    },
    'penalty_alpha': {
        'type': float,
        'required': False,
        'default': 0.0
    },
    'length_penalty': {
        'type': float,
        'required': False,
        'default': 1.0
    },
    'no_repeat_ngram_size': {
        'type': int,
        'required': False,
        'default': 0
    },
}


def load_quantized(model_name, wbits, groupsize, device):
    """Load a GPTQ-quantized model, adapted from
    https://github.com/oobabooga/text-generation-webui/blob/main/modules/GPTQ_loader.py"""
    # os.system("mkdir repositories && git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa")
    sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
    from modelutils import find_layers
    from quant import make_quant
    import transformers
    from transformers import AutoConfig

    # Try to determine the model type from the model name.
    name = model_name.lower()
    if any((k in name for k in ['llama', 'alpaca', 'vicuna'])):
        model_type = 'llama'
    elif any((k in name for k in ['opt-', 'galactica'])):
        model_type = 'opt'
    elif any((k in name for k in ['gpt-j', 'pygmalion-6b'])):
        model_type = 'gptj'
    else:
        print("Can't determine the model type from the model name, exiting...")
        exit()

    # Now we are going to try to locate the quantized model file.
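    # The weights are assumed to have been pre-downloaded into the Hugging Face
    # hub cache, so the snapshot directory is resolved by globbing the standard
    # cache layout (models--<org>--<name>/snapshots/<revision>/).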
    # path_to_model = Path('/workdir/model')
    path_to_model = next(
        Path('/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/")
    )
    found_pts = list(path_to_model.glob("*.pt"))
    found_safetensors = list(path_to_model.glob("*.safetensors"))
    pt_path = None

    if len(found_pts) == 1:
        pt_path = found_pts[0]
    elif len(found_safetensors) == 1:
        pt_path = found_safetensors[0]

    if not pt_path:
        print("Could not find the quantized model in .pt or .safetensors format, exiting...")
        exit()

    config = AutoConfig.from_pretrained(str(path_to_model))

    def noop(*args, **kwargs):
        pass

    # Skip the usual (slow) random weight initialization; the real weights are loaded below.
    torch.nn.init.kaiming_uniform_ = noop
    torch.nn.init.uniform_ = noop
    torch.nn.init.normal_ = noop

    torch.set_default_dtype(torch.half)
    transformers.modeling_utils._init_weights = False
    model = AutoModelForCausalLM.from_config(config)
    torch.set_default_dtype(torch.float)
    model = model.eval()

    # Replace every linear layer except the output head with its quantized version.
    layers = find_layers(model)
    for name in ['lm_head']:
        if name in layers:
            del layers[name]
    make_quant(model, layers, wbits, groupsize)
    del layers

    print('Loading model ...')
    if str(pt_path).endswith('.safetensors'):
        from safetensors.torch import load_file as safe_load
        if device == -1:
            device = "cpu"
        model.load_state_dict(safe_load(str(pt_path)), strict=False)
    else:
        model.load_state_dict(torch.load(str(pt_path)), strict=False)
    model.seqlen = 2048
    print('Done.')

    return model


def generator(job):
    ''' Run the job input to generate text output. '''
    # Validate the input against INPUT_SCHEMA.
    val_input = validate(job['input'], INPUT_SCHEMA)
    if 'errors' in val_input:
        return {"error": val_input['errors']}
    val_input = val_input['validated_input']

    input_ids = tokenizer(val_input['prompt'], return_tensors="pt").input_ids.to(device)

    gen_tokens = model.generate(
        input_ids,
        do_sample=val_input['do_sample'],
        temperature=val_input['temperature'],
        max_length=val_input['max_length'],
        repetition_penalty=val_input['repetition_penalty'],
        top_p=val_input['top_p'],
        top_k=val_input['top_k'],
        typical_p=val_input['typical_p'],
        encoder_repetition_penalty=val_input['encoder_repetition_penalty'],
        min_length=val_input['min_length'],
        num_beams=val_input['num_beams'],
        early_stopping=val_input['early_stopping'],
        penalty_alpha=val_input['penalty_alpha'],
        length_penalty=val_input['length_penalty'],
        no_repeat_ngram_size=val_input['no_repeat_ngram_size'],
    ).to(device)

    gen_text = tokenizer.batch_decode(gen_tokens)[0]

    return gen_text


# ---------------------------------------------------------------------------- #
#                                Parse Arguments                                #
# ---------------------------------------------------------------------------- #
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--model_name", type=str, default="gpt-neo-1.3B",
                    help="Name of the model to load.")


if __name__ == "__main__":
    args = parser.parse_args()

    # --------------------------------- Neo 1.3B --------------------------------- #
    if args.model_name == 'gpt-neo-1.3B':
        model = GPTNeoForCausalLM.from_pretrained(
            "EleutherAI/gpt-neo-1.3B", local_files_only=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(
            "EleutherAI/gpt-neo-1.3B", local_files_only=True)

    elif args.model_name == 'gpt-neo-2.7B':
        model = GPTNeoForCausalLM.from_pretrained(
            "EleutherAI/gpt-neo-2.7B", local_files_only=True,
            torch_dtype=torch.float16).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(
            "EleutherAI/gpt-neo-2.7B", local_files_only=True)

    elif args.model_name == 'gpt-neox-20b':
        model = GPTNeoXForCausalLM.from_pretrained(
            "EleutherAI/gpt-neox-20b", local_files_only=True).half().to(device)
        tokenizer = GPTNeoXTokenizerFast.from_pretrained(
            "EleutherAI/gpt-neox-20b", local_files_only=True)
    elif args.model_name == 'pygmalion-6b':
        model = AutoModelForCausalLM.from_pretrained(
            "PygmalionAI/pygmalion-6b", local_files_only=True,
            low_cpu_mem_usage=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "PygmalionAI/pygmalion-6b", local_files_only=True)

    elif args.model_name == 'pygmalion-6b-4bit-128g':
        # model = AutoModelForCausalLM.from_pretrained(
        #     "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True).to(device)
        model = load_quantized("pygmalion-6b-4bit-128g", 4, 128, device).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True)

    elif args.model_name == 'pygmalion-6b-gptq-4bit':
        model = load_quantized("pygmalion-6b-gptq-4bit", 4, 128, device).to(device)
        # model = AutoModelForCausalLM.from_pretrained(
        #     "OccamRazor/pygmalion-6b-gptq-4bit", local_files_only=True, from_pt=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "OccamRazor/pygmalion-6b-gptq-4bit", local_files_only=True)

    elif args.model_name == 'gpt-j-6b':
        model = GPTJForCausalLM.from_pretrained(
            "EleutherAI/gpt-j-6B", local_files_only=True, revision="float16",
            torch_dtype=torch.float16).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "EleutherAI/gpt-j-6B", local_files_only=True)

    elif args.model_name == 'ppo-shygmalion-6b':
        model = AutoModelForCausalLM.from_pretrained(
            "TehVenom/PPO_Shygmalion-6b", local_files_only=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "TehVenom/PPO_Shygmalion-6b", local_files_only=True)

    elif args.model_name == 'dolly-shygmalion-6b':
        model = AutoModelForCausalLM.from_pretrained(
            "TehVenom/Dolly_Shygmalion-6b", local_files_only=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "TehVenom/Dolly_Shygmalion-6b", local_files_only=True)

    elif args.model_name == 'erebus-13b':
        model = AutoModelForCausalLM.from_pretrained(
            "KoboldAI/OPT-13B-Erebus", local_files_only=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "KoboldAI/OPT-13B-Erebus", local_files_only=True)

    elif args.model_name == 'gpt4-x-alpaca-13b-native-4bit-128g':
        pass
        # model = LlamaForCausalLM.from_pretrained(
        #     "anon8231489123/gpt4-x-alpaca-13b-native-4bit-128g", local_files_only=True).to(device)
        # tokenizer = LlamaTokenizer.from_pretrained(
        #     "anon8231489123/gpt4-x-alpaca-13b-native-4bit-128g", local_files_only=True)

    elif args.model_name == 'gpt4-x-alpaca':
        pass
        # model = LlamaForCausalLM.from_pretrained(
        #     "chavinlo/gpt4-x-alpaca", local_files_only=True).to(device)
        # tokenizer = LlamaTokenizer.from_pretrained(
        #     "chavinlo/gpt4-x-alpaca", local_files_only=True)

    elif args.model_name == 'rwkv-4-raven-7b':
        from RWKV import RWKVModel, RWKVTokenizer

        path_to_model = next(
            Path('/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/")
        )
        found_pths = list(path_to_model.glob("*.pth"))
        pt_path = None
        if len(found_pths) == 1:
            pt_path = found_pths[0]
        else:
            print("Could not find the model, exiting...")
            exit()

        model = RWKVModel.from_pretrained(
            Path(str(pt_path)), dtype="fp16", device="cuda")
        tokenizer = RWKVTokenizer.from_pretrained(Path(str(path_to_model)))

    runpod.serverless.start({"handler": generator})
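
# Example job payload for a quick local sanity check once a model is loaded.
# The values are illustrative; only the keys are defined by INPUT_SCHEMA above:
#
#   generator({"input": {"prompt": "Hello, my name is", "max_length": 64}})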