matrix-pygmalion-bot/runpod/runpod-worker-transformers/runpod_infer.py


								'''

								RunPod | Transformer | Handler

								'''

								import argparse


								import accelerate

								import torch

								import runpod

								from runpod.serverless.utils.rp_validator import validate

								from transformers import (GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoXForCausalLM,

								                          GPTNeoXTokenizerFast, GPTJForCausalLM, AutoTokenizer, AutoModelForCausalLM,

								                          AutoConfig)

								#                          LlamaForCausalLM, LlamaTokenizer)


								from pathlib import Path

								import os, sys


								# Select the compute device once at import time: CUDA when torch reports it
								# as available, otherwise the CPU. (The former bare
								# `torch.cuda.is_available()` statement discarded its result and was dropped.)
								device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


								# Schema describing the accepted job input. 'prompt' is the only required
								# field; every other entry is optional and carries the default listed here.
								INPUT_SCHEMA = {
								    'prompt': {'type': str, 'required': True},
								    'do_sample': {
								        'type': bool,
								        'required': False,
								        'default': True,
								        'description': '''

								            Enables decoding strategies such as multinomial sampling,

								            beam-search multinomial sampling, Top-K sampling and Top-p sampling.

								            All these strategies select the next token from the probability distribution

								            over the entire vocabulary with various strategy-specific adjustments.

								        '''
								    },
								    # The remaining options all share the same shape: (name, type, default).
								    **{
								        option: {'type': kind, 'required': False, 'default': fallback}
								        for option, kind, fallback in (
								            ('max_length', int, 100),
								            ('temperature', float, 0.9),
								            ('repetition_penalty', float, 1.1),
								            ('top_p', float, 0.5),
								            ('top_k', int, 40),
								            ('typical_p', float, 1.0),
								            ('encoder_repetition_penalty', float, 1.0),
								            ('min_length', int, 0),
								            ('num_beams', int, 1),
								            ('early_stopping', bool, False),
								            ('penalty_alpha', float, 0.0),
								            ('length_penalty', float, 1.0),
								            ('no_repeat_ngram_size', int, 0),
								        )
								    },
								}


								def load_quantized(model_name, wbits, groupsize, device):
								    """Build a GPTQ-quantized causal LM and load its weights from the local cache.

								    Adapted from
								    https://github.com/oobabooga/text-generation-webui/blob/main/modules/GPTQ_loader.py

								    The model directory is the first ``models--*/snapshots/*/`` entry found
								    under ``/root/.cache/huggingface/hub/``. It must contain exactly one
								    ``*.pt`` file or, failing that, exactly one ``*.safetensors`` file.

								    Args:
								        model_name: Model name; only used to check that it matches a known
								            model family (llama / opt / gptj style names).
								        wbits: Forwarded to GPTQ-for-LLaMa's ``make_quant``.
								        groupsize: Forwarded to GPTQ-for-LLaMa's ``make_quant``.
								        device: Kept for interface compatibility. It is not used while
								            loading; callers are expected to move the returned model
								            themselves (e.g. ``load_quantized(...).to(device)``).

								    Returns:
								        The model in eval mode, with ``seqlen`` set to 2048.

								    Raises:
								        SystemExit: With status 1 when the model family cannot be determined
								            from ``model_name`` or no usable checkpoint is found.
								    """
								    # Reject unknown model families up front, before importing GPTQ code or
								    # touching the filesystem.
								    lowered_name = model_name.lower()
								    known_markers = ('llama', 'alpaca', 'vicuna',   # llama family
								                     'opt-', 'galactica',            # opt family
								                     'gpt-j', 'pygmalion-6b')        # gptj family
								    if not any(marker in lowered_name for marker in known_markers):
								        print("Can't determine model type from model name.")
								        sys.exit(1)

								    # The GPTQ-for-LLaMa checkout is expected under ./repositories.
								    sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
								    from modelutils import find_layers
								    from quant import make_quant

								    import transformers
								    from transformers import AutoConfig, AutoModelForCausalLM

								    # Locate the quantized checkpoint. `next(..., None)` avoids a bare
								    # StopIteration when the cache holds no snapshot directory at all.
								    path_to_model = next(
								        Path('/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/"), None)
								    pt_path = None
								    if path_to_model is not None:
								        # Prefer a single .pt file, then a single .safetensors file.
								        for pattern in ("*.pt", "*.safetensors"):
								            candidates = list(path_to_model.glob(pattern))
								            if len(candidates) == 1:
								                pt_path = candidates[0]
								                break

								    if not pt_path:
								        print("Could not find the quantized model in .pt or .safetensors format, exiting...")
								        sys.exit(1)

								    config = AutoConfig.from_pretrained(str(path_to_model))

								    # Skip weight initialisation: every weight is overwritten by the
								    # checkpoint below. NOTE: these patches are process-wide and stay in place.
								    def noop(*args, **kwargs):
								        pass

								    torch.nn.init.kaiming_uniform_ = noop
								    torch.nn.init.uniform_ = noop
								    torch.nn.init.normal_ = noop
								    transformers.modeling_utils._init_weights = False

								    # Instantiate the model in half precision, and always put the default
								    # dtype back to float even if construction fails.
								    torch.set_default_dtype(torch.half)
								    try:
								        model = AutoModelForCausalLM.from_config(config)
								    finally:
								        torch.set_default_dtype(torch.float)
								    model = model.eval()

								    # Quantize every layer reported by find_layers except the output head.
								    layers = find_layers(model)
								    for skipped in ['lm_head']:
								        if skipped in layers:
								            del layers[skipped]
								    make_quant(model, layers, wbits, groupsize)
								    del layers

								    print('Loading model ...')
								    if str(pt_path).endswith('.safetensors'):
								        from safetensors.torch import load_file as safe_load
								        model.load_state_dict(safe_load(str(pt_path)), strict=False)
								    else:
								        # NOTE(review): torch.load unpickles the file; only use checkpoints
								        # from a trusted source.
								        model.load_state_dict(torch.load(str(pt_path)), strict=False)
								    model.seqlen = 2048
								    print('Done.')

								    return model


								def generator(job):
								    '''
								    RunPod handler: validate the job input and generate text for its prompt.

								    Returns the first decoded sequence as a string, or a dict of the form
								    {"error": ...} when the input does not pass validation against INPUT_SCHEMA.
								    Relies on the module-level `model`, `tokenizer` and `device`.
								    '''
								    # Validate the input and bail out early on schema errors.
								    checked = validate(job['input'], INPUT_SCHEMA)
								    if 'errors' in checked:
								        return {"error": checked['errors']}
								    params = checked['validated_input']

								    # Tokenize the prompt and move the ids to the compute device.
								    encoded = tokenizer(params['prompt'], return_tensors="pt")
								    prompt_ids = encoded.input_ids.to(device)

								    # Every generation option is forwarded under its schema name.
								    option_names = (
								        'do_sample',
								        'temperature',
								        'max_length',
								        'repetition_penalty',
								        'top_p',
								        'top_k',
								        'typical_p',
								        'encoder_repetition_penalty',
								        'min_length',
								        'num_beams',
								        'early_stopping',
								        'penalty_alpha',
								        'length_penalty',
								        'no_repeat_ngram_size',
								    )
								    generation_options = {option: params[option] for option in option_names}
								    output_ids = model.generate(prompt_ids, **generation_options).to(device)

								    decoded = tokenizer.batch_decode(output_ids)
								    return decoded[0]


								# ---------------------------------------------------------------------------- #

								#                                Parse Arguments                               #

								# ---------------------------------------------------------------------------- #

								parser = argparse.ArgumentParser(description=__doc__)
								parser.add_argument("--model_name", type=str,
								                    default="gpt-neo-1.3B",
								                    help="Name of the model to load from the local Hugging Face cache.")


								def _load_pretrained(model_cls, tokenizer_cls, repo_id, cast_half=False, **model_kwargs):
								    """Load a locally cached Hugging Face model (moved to `device`) and its tokenizer."""
								    net = model_cls.from_pretrained(repo_id, local_files_only=True, **model_kwargs)
								    if cast_half:
								        net = net.half()
								    net = net.to(device)
								    tok = tokenizer_cls.from_pretrained(repo_id, local_files_only=True)
								    return net, tok


								def _load_gptq(short_name, repo_id):
								    """Load a 4-bit / groupsize-128 GPTQ model via load_quantized, plus its tokenizer."""
								    net = load_quantized(short_name, 4, 128, device).to(device)
								    tok = AutoTokenizer.from_pretrained(repo_id, local_files_only=True)
								    return net, tok


								def _load_rwkv():
								    """Load the RWKV model and tokenizer from the single *.pth file in the local cache."""
								    from RWKV import RWKVModel, RWKVTokenizer

								    # `next(..., None)` avoids a bare StopIteration on an empty cache.
								    snapshot_dir = next(
								        Path('/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/"), None)
								    checkpoints = list(snapshot_dir.glob("*.pth")) if snapshot_dir is not None else []
								    if len(checkpoints) != 1:
								        print("Could not find the model, exiting...")
								        sys.exit(1)
								    net = RWKVModel.from_pretrained(Path(str(checkpoints[0])), dtype="fp16", device="cuda")
								    tok = RWKVTokenizer.from_pretrained(Path(str(snapshot_dir)))
								    return net, tok


								def load_model_and_tokenizer(model_name):
								    """Return the ``(model, tokenizer)`` pair for a supported ``--model_name``.

								    Args:
								        model_name: One of the short names listed in the loader table below.

								    Returns:
								        A ``(model, tokenizer)`` tuple.

								    Raises:
								        SystemExit: With status 1 when ``model_name`` has no loader. This
								            includes the gpt4-x-alpaca variants, whose Llama classes are not
								            imported by this file; without the check the worker would start
								            and then fail on every job because no model was ever loaded.
								    """
								    # Entries are lambdas so that nothing is loaded (and no class is even
								    # looked up) until the requested model has been selected.
								    loaders = {
								        'gpt-neo-1.3B': lambda: _load_pretrained(
								            GPTNeoForCausalLM, GPT2Tokenizer, "EleutherAI/gpt-neo-1.3B"),
								        'gpt-neo-2.7B': lambda: _load_pretrained(
								            GPTNeoForCausalLM, GPT2Tokenizer, "EleutherAI/gpt-neo-2.7B",
								            torch_dtype=torch.float16),
								        'gpt-neox-20b': lambda: _load_pretrained(
								            GPTNeoXForCausalLM, GPTNeoXTokenizerFast, "EleutherAI/gpt-neox-20b",
								            cast_half=True),
								        'pygmalion-6b': lambda: _load_pretrained(
								            AutoModelForCausalLM, AutoTokenizer, "PygmalionAI/pygmalion-6b",
								            low_cpu_mem_usage=True),
								        'pygmalion-6b-4bit-128g': lambda: _load_gptq(
								            "pygmalion-6b-4bit-128g", "mayaeary/pygmalion-6b-4bit-128g"),
								        'pygmalion-6b-gptq-4bit': lambda: _load_gptq(
								            "pygmalion-6b-gptq-4bit", "OccamRazor/pygmalion-6b-gptq-4bit"),
								        'gpt-j-6b': lambda: _load_pretrained(
								            GPTJForCausalLM, AutoTokenizer, "EleutherAI/gpt-j-6B",
								            revision="float16", torch_dtype=torch.float16),
								        'ppo-shygmalion-6b': lambda: _load_pretrained(
								            AutoModelForCausalLM, AutoTokenizer, "TehVenom/PPO_Shygmalion-6b"),
								        'dolly-shygmalion-6b': lambda: _load_pretrained(
								            AutoModelForCausalLM, AutoTokenizer, "TehVenom/Dolly_Shygmalion-6b"),
								        'erebus-13b': lambda: _load_pretrained(
								            AutoModelForCausalLM, AutoTokenizer, "KoboldAI/OPT-13B-Erebus"),
								        'rwkv-4-raven-7b': _load_rwkv,
								    }

								    loader = loaders.get(model_name)
								    if loader is None:
								        print(f"Model '{model_name}' is not supported by this worker, exiting...")
								        sys.exit(1)
								    return loader()


								if __name__ == "__main__":
								    args = parser.parse_args()

								    # generator() reads `model` and `tokenizer` as module-level globals.
								    model, tokenizer = load_model_and_tokenizer(args.model_name)

								    runpod.serverless.start({"handler": generator})