matrix-pygmalion-bot/runpod/runpod-worker-transformers/runpod_infer.py

'''
RunPod | Transformer | Handler
'''
import argparse

import accelerate
import torch
import runpod
from runpod.serverless.utils.rp_validator import validate
from transformers import (GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoXForCausalLM,
                          GPTNeoXTokenizerFast, GPTJForCausalLM, AutoTokenizer, AutoModelForCausalLM,
                          AutoConfig)
#                          LlamaForCausalLM, LlamaTokenizer)

from pathlib import Path
import os, sys


torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

INPUT_SCHEMA = {
    'prompt': {
        'type': str,
        'required': True
    },
    'do_sample': {
        'type': bool,
        'required': False,
        'default': True,
        'description': '''
            Enables decoding strategies such as multinomial sampling,
            beam-search multinomial sampling, Top-K sampling and Top-p sampling.
            All these strategies select the next token from the probability distribution
            over the entire vocabulary with various strategy-specific adjustments.
        '''
    },
    'max_length': {
        'type': int,
        'required': False,
        'default': 100
    },
    'temperature': {
        'type': float,
        'required': False,
        'default': 0.9
    },
    'repetition_penalty': {
        'type': float,
        'required': False,
        'default': 1.1
    },
    'top_p': {
        'type': float,
        'required': False,
        'default': 0.5
    },
    'top_k': {
        'type': int,
        'required': False,
        'default': 40
    },
    'typical_p': {
        'type': float,
        'required': False,
        'default': 1.0
    },
    'encoder_repetition_penalty': {
        'type': float,
        'required': False,
        'default': 1.0
    },
    'min_length': {
        'type': int,
        'required': False,
        'default': 0
    },
    'num_beams': {
        'type': int,
        'required': False,
        'default': 1
    },
    'early_stopping': {
        'type': bool,
        'required': False,
        'default': False
    },
    'penalty_alpha': {
        'type': float,
        'required': False,
        'default': 0.0
    },
    'length_penalty': {
        'type': float,
        'required': False,
        'default': 1.0
    },
    'no_repeat_ngram_size': {
        'type': int,
        'required': False,
        'default': 0
    },
    'chat_generation_attempts': {
        'type': int,
        'required': False,
        'default': 1
    },

}


def load_quantized(model_name, wbits, groupsize, device):
    """https://github.com/oobabooga/text-generation-webui/blob/main/modules/GPTQ_loader.py"""
#    os.system("mkdir repositories && git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa")
    sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
    from modelutils import find_layers
    from quant import make_quant

    import transformers
    from transformers import AutoConfig

    # Try to determine model type from model name
    name = model_name.lower()
    if any((k in name for k in ['llama', 'alpaca', 'vicuna'])):
        model_type = 'llama'
    elif any((k in name for k in ['opt-', 'galactica'])):
        model_type = 'opt'
    elif any((k in name for k in ['gpt-j', 'pygmalion-6b'])):
        model_type = 'gptj'
    else:
        print("Can't determine model type from model name."
              "argument")
        exit()

    # Now we are going to try to locate the quantized model file.
    #path_to_model = Path(f'/workdir/model')
    path_to_model = next( Path(f'/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/") )
    found_pts = list(path_to_model.glob("*.pt"))
    found_safetensors = list(path_to_model.glob("*.safetensors"))
    pt_path = None

    if len(found_pts) > 0:
        pt_path = found_pts[-1]
    elif len(found_safetensors) > 0:
        pt_path = found_safetensors[-1]
    else:
        pass

    if not pt_path:
        print("Could not find the quantized model in .pt or .safetensors format, exiting...")
        exit()

    config = AutoConfig.from_pretrained(str(path_to_model))
    def noop(*args, **kwargs):
        pass
    torch.nn.init.kaiming_uniform_ = noop
    torch.nn.init.uniform_ = noop
    torch.nn.init.normal_ = noop

    torch.set_default_dtype(torch.half)
    transformers.modeling_utils._init_weights = False
    torch.set_default_dtype(torch.half)
    model = AutoModelForCausalLM.from_config(config)
    torch.set_default_dtype(torch.float)
    model = model.eval()
    layers = find_layers(model)
    for name in ['lm_head']:
        if name in layers:
            del layers[name]
    make_quant(model, layers, wbits, groupsize)
    del layers

    print('Loading model ...')
    if str(pt_path).endswith('.safetensors'):
        from safetensors.torch import load_file as safe_load
        if device == -1:
            device = "cpu"
        model.load_state_dict(safe_load(str(pt_path)), strict = False)
    else:
        model.load_state_dict(torch.load(str(pt_path)), strict = False)
    model.seqlen = 2048
    print('Done.')

    return model


def generator(job):
    '''
    Run the job input to generate text output.
    '''
    # Validate the input
    val_input = validate(job['input'], INPUT_SCHEMA)
    if 'errors' in val_input:
        return {"error": val_input['errors']}
    val_input = val_input['validated_input']

    input_ids = tokenizer(val_input['prompt'], return_tensors="pt").input_ids.to(device)

    output = []
    for i in range(val_input['chat_generation_attempts']):
        gen_tokens = model.generate(
            input_ids,
            do_sample=val_input['do_sample'],
            temperature=val_input['temperature'],
            max_length=val_input['max_length'],
            repetition_penalty=val_input['repetition_penalty'],
            top_p=val_input['top_p'],
            top_k=val_input['top_k'],
            typical_p=val_input['typical_p'],
            encoder_repetition_penalty=val_input['encoder_repetition_penalty'],
            min_length=val_input['min_length'],
            num_beams=val_input['num_beams'],
            early_stopping=val_input['early_stopping'],
            penalty_alpha=val_input['penalty_alpha'],
            length_penalty=val_input['length_penalty'],
            no_repeat_ngram_size=val_input['no_repeat_ngram_size'],
        ).to(device)

        gen_text = tokenizer.batch_decode(gen_tokens)[0]
        if val_input['chat_generation_attempts'] == 1:
            output = gen_text
        else:
            output.append(gen_text)

    return output


# ---------------------------------------------------------------------------- #
#                                Parse Arguments                               #
# ---------------------------------------------------------------------------- #
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--model_name", type=str,
                    default="gpt-neo-1.3B", help="URL of the model to download.")


if __name__ == "__main__":
    args = parser.parse_args()

    # --------------------------------- Neo 1.3B --------------------------------- #
    if args.model_name == 'gpt-neo-1.3B':
        model = GPTNeoForCausalLM.from_pretrained(
            "EleutherAI/gpt-neo-1.3B", local_files_only=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B", local_files_only=True)

    elif args.model_name == 'gpt-neo-2.7B':
        model = GPTNeoForCausalLM.from_pretrained(
            "EleutherAI/gpt-neo-2.7B", local_files_only=True, torch_dtype=torch.float16).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B", local_files_only=True)

    elif args.model_name == 'gpt-neox-20b':
        model = GPTNeoXForCausalLM.from_pretrained(
            "EleutherAI/gpt-neox-20b", local_files_only=True).half().to(device)
        tokenizer = GPTNeoXTokenizerFast.from_pretrained(
            "EleutherAI/gpt-neox-20b", local_files_only=True)

    elif args.model_name == 'pygmalion-6b':
        model = AutoModelForCausalLM.from_pretrained(
            "PygmalionAI/pygmalion-6b", local_files_only=True, low_cpu_mem_usage=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "PygmalionAI/pygmalion-6b", local_files_only=True)

    elif args.model_name == 'pygmalion-6b-4bit-128g':
#        model = AutoModelForCausalLM.from_pretrained(
#            "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True).to(device)
        model = load_quantized("pygmalion-6b-4bit-128g", 4, 128, device).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True)

    elif args.model_name == 'pygmalion-6b-gptq-4bit':
        model = load_quantized("pygmalion-6b-gptq-4bit", 4, 128, device).to(device)
#        model = AutoModelForCausalLM.from_pretrained(
#            "OccamRazor/pygmalion-6b-gptq-4bit", local_files_only=True, from_pt=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "OccamRazor/pygmalion-6b-gptq-4bit", local_files_only=True)

    elif args.model_name == 'gpt-j-6b':
        model = GPTJForCausalLM.from_pretrained(
            "EleutherAI/gpt-j-6B", local_files_only=True, revision="float16",
            torch_dtype=torch.float16).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "EleutherAI/gpt-j-6B", local_files_only=True)

    elif args.model_name == 'ppo-shygmalion-6b':
        model = AutoModelForCausalLM.from_pretrained(
            "TehVenom/PPO_Shygmalion-6b", local_files_only=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "TehVenom/PPO_Shygmalion-6b", local_files_only=True)

    elif args.model_name == 'dolly-shygmalion-6b':
        model = AutoModelForCausalLM.from_pretrained(
            "TehVenom/Dolly_Shygmalion-6b", local_files_only=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "TehVenom/Dolly_Shygmalion-6b", local_files_only=True)

    elif args.model_name == 'erebus-13b':
        model = AutoModelForCausalLM.from_pretrained(
            "KoboldAI/OPT-13B-Erebus", local_files_only=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "KoboldAI/OPT-13B-Erebus", local_files_only=True)

    elif args.model_name == 'gpt4-x-alpaca-13b-native-4bit-128g':
        pass
#        model = LlamaForCausalLM.from_pretrained(
#            "anon8231489123/gpt4-x-alpaca-13b-native-4bit-128g", local_files_only=True).to(device)
#        tokenizer = LlamaTokenizer.from_pretrained(
#            "anon8231489123/gpt4-x-alpaca-13b-native-4bit-128g", local_files_only=True)

    elif args.model_name == 'gpt4-x-alpaca':
        pass
#        model = LlamaForCausalLM.from_pretrained(
#            "chavinlo/gpt4-x-alpaca", local_files_only=True).to(device)
#        tokenizer = LlamaTokenizer.from_pretrained(
#            "chavinlo/gpt4-x-alpaca", local_files_only=True)

    elif args.model_name == 'rwkv-4-raven-7b':
        from RWKV import RWKVModel, RWKVTokenizer
        path_to_model = next( Path(f'/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/") )
        found_pths = list(path_to_model.glob("*.pth"))
        pt_path = None
        if len(found_pths) > 0:
            pt_path = found_pths[-1]
        else:
            print("Could not find the model, exiting...")
            exit()
        model = RWKVModel.from_pretrained(Path(str(pt_path)), dtype="fp16", device="cuda")
        tokenizer = RWKVTokenizer.from_pretrained(Path(str(path_to_model)))


    runpod.serverless.start({"handler": generator})
work on remote workers 2 years ago			`'''`
			`RunPod \| Transformer \| Handler`
			`'''`
			`import argparse`

rewrite remote transformers container 2 years ago			`import accelerate`
work on remote workers 2 years ago			`import torch`
			`import runpod`
			`from runpod.serverless.utils.rp_validator import validate`
			`from transformers import (GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoXForCausalLM,`
rewrite remote transformers container 2 years ago			`GPTNeoXTokenizerFast, GPTJForCausalLM, AutoTokenizer, AutoModelForCausalLM,`
			`AutoConfig)`
test 2 years ago			`# LlamaForCausalLM, LlamaTokenizer)`
work on remote workers 2 years ago
prepare RWKV 2 years ago			`from pathlib import Path`
			`import os, sys`

work on remote workers 2 years ago
			`torch.cuda.is_available()`
			`device = torch.device("cuda" if torch.cuda.is_available() else "cpu")`

			`INPUT_SCHEMA = {`
			`'prompt': {`
			`'type': str,`
			`'required': True`
			`},`
			`'do_sample': {`
			`'type': bool,`
			`'required': False,`
			`'default': True,`
			`'description': '''`
			`Enables decoding strategies such as multinomial sampling,`
			`beam-search multinomial sampling, Top-K sampling and Top-p sampling.`
			`All these strategies select the next token from the probability distribution`
			`over the entire vocabulary with various strategy-specific adjustments.`
			`'''`
			`},`
			`'max_length': {`
			`'type': int,`
			`'required': False,`
			`'default': 100`
			`},`
			`'temperature': {`
			`'type': float,`
			`'required': False,`
			`'default': 0.9`
			`},`
			`'repetition_penalty': {`
			`'type': float,`
			`'required': False,`
			`'default': 1.1`
			`},`
			`'top_p': {`
			`'type': float,`
			`'required': False,`
			`'default': 0.5`
			`},`
			`'top_k': {`
			`'type': int,`
			`'required': False,`
			`'default': 40`
			`},`
			`'typical_p': {`
			`'type': float,`
			`'required': False,`
			`'default': 1.0`
			`},`
			`'encoder_repetition_penalty': {`
			`'type': float,`
			`'required': False,`
			`'default': 1.0`
			`},`
			`'min_length': {`
			`'type': int,`
			`'required': False,`
			`'default': 0`
			`},`
			`'num_beams': {`
			`'type': int,`
			`'required': False,`
			`'default': 1`
			`},`
			`'early_stopping': {`
			`'type': bool,`
			`'required': False,`
			`'default': False`
			`},`
			`'penalty_alpha': {`
			`'type': float,`
			`'required': False,`
			`'default': 0.0`
			`},`
			`'length_penalty': {`
			`'type': float,`
			`'required': False,`
			`'default': 1.0`
			`},`
			`'no_repeat_ngram_size': {`
			`'type': int,`
			`'required': False,`
			`'default': 0`
			`},`
generate multiple answers 2 years ago			`'chat_generation_attempts': {`
			`'type': int,`
			`'required': False,`
			`'default': 1`
			`},`
work on remote workers 2 years ago
			`}`


more tests on remote worker container 2 years ago			`def load_quantized(model_name, wbits, groupsize, device):`
rewrite remote transformers container 2 years ago			`"""https://github.com/oobabooga/text-generation-webui/blob/main/modules/GPTQ_loader.py"""`
			`# os.system("mkdir repositories && git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa")`
			`sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))`
			`from modelutils import find_layers`
			`from quant import make_quant`

			`import transformers`
			`from transformers import AutoConfig`

			`# Try to determine model type from model name`
			`name = model_name.lower()`
			`if any((k in name for k in ['llama', 'alpaca', 'vicuna'])):`
			`model_type = 'llama'`
			`elif any((k in name for k in ['opt-', 'galactica'])):`
			`model_type = 'opt'`
			`elif any((k in name for k in ['gpt-j', 'pygmalion-6b'])):`
			`model_type = 'gptj'`
			`else:`
			`print("Can't determine model type from model name."`
			`"argument")`
			`exit()`

			`# Now we are going to try to locate the quantized model file.`
			`#path_to_model = Path(f'/workdir/model')`
			`path_to_model = next( Path(f'/root/.cache/huggingface/hub/').glob("models--/snapshots//") )`
			`found_pts = list(path_to_model.glob("*.pt"))`
			`found_safetensors = list(path_to_model.glob("*.safetensors"))`
			`pt_path = None`

more robust model file search 2 years ago			`if len(found_pts) > 0:`
			`pt_path = found_pts[-1]`
			`elif len(found_safetensors) > 0:`
			`pt_path = found_safetensors[-1]`
rewrite remote transformers container 2 years ago			`else:`
			`pass`

			`if not pt_path:`
			`print("Could not find the quantized model in .pt or .safetensors format, exiting...")`
			`exit()`

			`config = AutoConfig.from_pretrained(str(path_to_model))`
			`def noop(args, *kwargs):`
			`pass`
			`torch.nn.init.kaiming_uniform_ = noop`
			`torch.nn.init.uniform_ = noop`
			`torch.nn.init.normal_ = noop`

			`torch.set_default_dtype(torch.half)`
			`transformers.modeling_utils._init_weights = False`
			`torch.set_default_dtype(torch.half)`
			`model = AutoModelForCausalLM.from_config(config)`
			`torch.set_default_dtype(torch.float)`
			`model = model.eval()`
			`layers = find_layers(model)`
			`for name in ['lm_head']:`
			`if name in layers:`
			`del layers[name]`
			`make_quant(model, layers, wbits, groupsize)`
			`del layers`

			`print('Loading model ...')`
			`if str(pt_path).endswith('.safetensors'):`
			`from safetensors.torch import load_file as safe_load`
more tests on remote worker container 2 years ago			`if device == -1:`
			`device = "cpu"`
remote worker 2 years ago			`model.load_state_dict(safe_load(str(pt_path)), strict = False)`
rewrite remote transformers container 2 years ago			`else:`
more tests on the remote workers 2 years ago			`model.load_state_dict(torch.load(str(pt_path)), strict = False)`
rewrite remote transformers container 2 years ago			`model.seqlen = 2048`
			`print('Done.')`

			`return model`


work on remote workers 2 years ago			`def generator(job):`
			`'''`
			`Run the job input to generate text output.`
			`'''`
			`# Validate the input`
			`val_input = validate(job['input'], INPUT_SCHEMA)`
			`if 'errors' in val_input:`
			`return {"error": val_input['errors']}`
			`val_input = val_input['validated_input']`

			`input_ids = tokenizer(val_input['prompt'], return_tensors="pt").input_ids.to(device)`

generate multiple answers 2 years ago			`output = []`
			`for i in range(val_input['chat_generation_attempts']):`
			`gen_tokens = model.generate(`
			`input_ids,`
			`do_sample=val_input['do_sample'],`
			`temperature=val_input['temperature'],`
			`max_length=val_input['max_length'],`
			`repetition_penalty=val_input['repetition_penalty'],`
			`top_p=val_input['top_p'],`
			`top_k=val_input['top_k'],`
			`typical_p=val_input['typical_p'],`
			`encoder_repetition_penalty=val_input['encoder_repetition_penalty'],`
			`min_length=val_input['min_length'],`
			`num_beams=val_input['num_beams'],`
			`early_stopping=val_input['early_stopping'],`
			`penalty_alpha=val_input['penalty_alpha'],`
			`length_penalty=val_input['length_penalty'],`
			`no_repeat_ngram_size=val_input['no_repeat_ngram_size'],`
			`).to(device)`

			`gen_text = tokenizer.batch_decode(gen_tokens)[0]`
			`if val_input['chat_generation_attempts'] == 1:`
			`output = gen_text`
			`else:`
			`output.append(gen_text)`

			`return output`
work on remote workers 2 years ago

			`# ---------------------------------------------------------------------------- #`
			`# Parse Arguments #`
			`# ---------------------------------------------------------------------------- #`
			`parser = argparse.ArgumentParser(description=__doc__)`
			`parser.add_argument("--model_name", type=str,`
			`default="gpt-neo-1.3B", help="URL of the model to download.")`


			`if __name__ == "__main__":`
			`args = parser.parse_args()`

			`# --------------------------------- Neo 1.3B --------------------------------- #`
			`if args.model_name == 'gpt-neo-1.3B':`
			`model = GPTNeoForCausalLM.from_pretrained(`
			`"EleutherAI/gpt-neo-1.3B", local_files_only=True).to(device)`
			`tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B", local_files_only=True)`

			`elif args.model_name == 'gpt-neo-2.7B':`
			`model = GPTNeoForCausalLM.from_pretrained(`
			`"EleutherAI/gpt-neo-2.7B", local_files_only=True, torch_dtype=torch.float16).to(device)`
			`tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B", local_files_only=True)`

			`elif args.model_name == 'gpt-neox-20b':`
			`model = GPTNeoXForCausalLM.from_pretrained(`
			`"EleutherAI/gpt-neox-20b", local_files_only=True).half().to(device)`
			`tokenizer = GPTNeoXTokenizerFast.from_pretrained(`
			`"EleutherAI/gpt-neox-20b", local_files_only=True)`

			`elif args.model_name == 'pygmalion-6b':`
			`model = AutoModelForCausalLM.from_pretrained(`
test 2 years ago			`"PygmalionAI/pygmalion-6b", local_files_only=True, low_cpu_mem_usage=True).to(device)`
work on remote workers 2 years ago			`tokenizer = AutoTokenizer.from_pretrained(`
			`"PygmalionAI/pygmalion-6b", local_files_only=True)`

test 2 years ago			`elif args.model_name == 'pygmalion-6b-4bit-128g':`
rewrite remote transformers container 2 years ago			`# model = AutoModelForCausalLM.from_pretrained(`
			`# "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True).to(device)`
more tests on remote worker container 2 years ago			`model = load_quantized("pygmalion-6b-4bit-128g", 4, 128, device).to(device)`
test 2 years ago			`tokenizer = AutoTokenizer.from_pretrained(`
			`"mayaeary/pygmalion-6b-4bit-128g", local_files_only=True)`

			`elif args.model_name == 'pygmalion-6b-gptq-4bit':`
try other quantized model 2 years ago			`model = load_quantized("pygmalion-6b-gptq-4bit", 4, 128, device).to(device)`
			`# model = AutoModelForCausalLM.from_pretrained(`
			`# "OccamRazor/pygmalion-6b-gptq-4bit", local_files_only=True, from_pt=True).to(device)`
test 2 years ago			`tokenizer = AutoTokenizer.from_pretrained(`
			`"OccamRazor/pygmalion-6b-gptq-4bit", local_files_only=True)`

work on remote workers 2 years ago			`elif args.model_name == 'gpt-j-6b':`
			`model = GPTJForCausalLM.from_pretrained(`
			`"EleutherAI/gpt-j-6B", local_files_only=True, revision="float16",`
			`torch_dtype=torch.float16).to(device)`
			`tokenizer = AutoTokenizer.from_pretrained(`
			`"EleutherAI/gpt-j-6B", local_files_only=True)`

add experimental and wip worker containers 2 years ago			`elif args.model_name == 'ppo-shygmalion-6b':`
work on remote workers 2 years ago			`model = AutoModelForCausalLM.from_pretrained(`
			`"TehVenom/PPO_Shygmalion-6b", local_files_only=True).to(device)`
			`tokenizer = AutoTokenizer.from_pretrained(`
			`"TehVenom/PPO_Shygmalion-6b", local_files_only=True)`

add experimental and wip worker containers 2 years ago			`elif args.model_name == 'dolly-shygmalion-6b':`
			`model = AutoModelForCausalLM.from_pretrained(`
			`"TehVenom/Dolly_Shygmalion-6b", local_files_only=True).to(device)`
			`tokenizer = AutoTokenizer.from_pretrained(`
			`"TehVenom/Dolly_Shygmalion-6b", local_files_only=True)`

work on remote workers 2 years ago			`elif args.model_name == 'erebus-13b':`
			`model = AutoModelForCausalLM.from_pretrained(`
			`"KoboldAI/OPT-13B-Erebus", local_files_only=True).to(device)`
			`tokenizer = AutoTokenizer.from_pretrained(`
			`"KoboldAI/OPT-13B-Erebus", local_files_only=True)`

add experimental and wip worker containers 2 years ago			`elif args.model_name == 'gpt4-x-alpaca-13b-native-4bit-128g':`
test 2 years ago			`pass`
			`# model = LlamaForCausalLM.from_pretrained(`
			`# "anon8231489123/gpt4-x-alpaca-13b-native-4bit-128g", local_files_only=True).to(device)`
			`# tokenizer = LlamaTokenizer.from_pretrained(`
			`# "anon8231489123/gpt4-x-alpaca-13b-native-4bit-128g", local_files_only=True)`
add experimental and wip worker containers 2 years ago
			`elif args.model_name == 'gpt4-x-alpaca':`
test 2 years ago			`pass`
			`# model = LlamaForCausalLM.from_pretrained(`
			`# "chavinlo/gpt4-x-alpaca", local_files_only=True).to(device)`
			`# tokenizer = LlamaTokenizer.from_pretrained(`
			`# "chavinlo/gpt4-x-alpaca", local_files_only=True)`

			`elif args.model_name == 'rwkv-4-raven-7b':`
prepare RWKV 2 years ago			`from RWKV import RWKVModel, RWKVTokenizer`
			`path_to_model = next( Path(f'/root/.cache/huggingface/hub/').glob("models--/snapshots//") )`
			`found_pths = list(path_to_model.glob("*.pth"))`
			`pt_path = None`
more robust model file search 2 years ago			`if len(found_pths) > 0:`
			`pt_path = found_pths[-1]`
prepare RWKV 2 years ago			`else:`
			`print("Could not find the model, exiting...")`
			`exit()`
			`model = RWKVModel.from_pretrained(Path(str(pt_path)), dtype="fp16", device="cuda")`
			`tokenizer = RWKVTokenizer.from_pretrained(Path(str(path_to_model)))`

add experimental and wip worker containers 2 years ago
work on remote workers 2 years ago			`runpod.serverless.start({"handler": generator})`