diff --git a/matrix_pygmalion_bot/ai/runpod_pygmalion.py b/matrix_pygmalion_bot/ai/runpod_pygmalion.py
index e2cb9e7..9813486 100644
--- a/matrix_pygmalion_bot/ai/runpod_pygmalion.py
+++ b/matrix_pygmalion_bot/ai/runpod_pygmalion.py
@@ -19,10 +19,11 @@ async def generate_sync(
     prompt: str,
     api_key: str,
     bot,
-    typing_fn
+    typing_fn,
+    api_endpoint="pygmalion-6b"
 ):
     # Set the API endpoint URL
-    endpoint = "https://api.runpod.ai/v2/pygmalion-6b/run"
+    endpoint = f"https://api.runpod.ai/v2/{api_endpoint}/run"
 
     # Set the headers for the request
     headers = {
@@ -56,7 +57,7 @@ async def generate_sync(
     TIMEOUT = 360
     DELAY = 5
     for i in range(TIMEOUT//DELAY):
-        endpoint = "https://api.runpod.ai/v2/pygmalion-6b/status/" + job_id
+        endpoint = f"https://api.runpod.ai/v2/{api_endpoint}/status/" + job_id
         r = requests.get(endpoint, headers=headers)
         r_json = r.json()
         logger.info(r_json)
diff --git a/runpod/runpod-worker-transformers/Dockerfile b/runpod/runpod-worker-transformers/Dockerfile
index 5a6cc39..0800ba2 100644
--- a/runpod/runpod-worker-transformers/Dockerfile
+++ b/runpod/runpod-worker-transformers/Dockerfile
@@ -55,7 +55,8 @@ RUN pip3 install --upgrade pip && \
     pip3 install safetensors && \
    pip3 install sentencepiece && \
     pip3 install diffusers && \
-    pip3 install git+https://github.com/huggingface/transformers accelerate xformers triton && \
+    pip3 install accelerate xformers triton && \
+    pip3 install git+https://github.com/huggingface/transformers.git && \
     pip3 install huggingface-hub && \
     pip3 install runpod && \
     pip3 cache purge
@@ -88,8 +89,9 @@ RUN pip3 install --upgrade pip && \
     pip3 install safetensors && \
     pip3 install sentencepiece && \
     pip3 install diffusers && \
-    pip3 install git+https://github.com/huggingface/transformers accelerate xformers triton && \
-#    pip3 install rwkv && \
+    pip3 install accelerate xformers triton && \
+    pip3 install git+https://github.com/huggingface/transformers.git && \
+    pip3 install rwkv && \
     pip3 install huggingface-hub && \
     pip3 install runpod && \
     pip3 cache purge
@@ -103,6 +105,7 @@ COPY --from=builder /root/.cache/huggingface /root/.cache/huggingface
 
 COPY model_fetcher.py /workspace/
 COPY runpod_infer.py /workspace/
+COPY RWKV.py /workspace/
 COPY test_input.json /workspace/
 
 CMD python3 -u runpod_infer.py --model_name=${MODEL_NAME}
diff --git a/runpod/runpod-worker-transformers/RWKV.py b/runpod/runpod-worker-transformers/RWKV.py
new file mode 100644
index 0000000..21bfcbf
--- /dev/null
+++ b/runpod/runpod-worker-transformers/RWKV.py
@@ -0,0 +1,74 @@
+# https://github.com/oobabooga/text-generation-webui/blob/main/modules/RWKV.py
+import os
+from pathlib import Path
+
+import numpy as np
+from tokenizers import Tokenizer
+
+#from modules.callbacks import Iteratorize
+
+np.set_printoptions(precision=4, suppress=True, linewidth=200)
+
+os.environ['RWKV_JIT_ON'] = '1'
+os.environ["RWKV_CUDA_ON"] = '1'
+
+from rwkv.model import RWKV
+from rwkv.utils import PIPELINE, PIPELINE_ARGS
+
+
+class RWKVModel:
+    def __init__(self):
+        pass
+
+    @classmethod
+    def from_pretrained(self, path, dtype="fp16", device="cuda"):
+        tokenizer_path = Path(f"{path.parent}/20B_tokenizer.json")
+
+        model = RWKV(model=str(path), strategy=f'{device} {dtype}')
+        #model = RWKV(model=str(path), strategy='cuda fp16i8 *8 -> cuda fp16')
+        pipeline = PIPELINE(model, str(tokenizer_path))
+
+        result = self()
+        result.pipeline = pipeline
+        return result
+
+    def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=None, alpha_frequency=0.1, alpha_presence=0.1, token_ban=[0], token_stop=[], callback=None):
+        args = PIPELINE_ARGS(
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            alpha_frequency=alpha_frequency,  # Frequency Penalty (as in GPT-3)
+            alpha_presence=alpha_presence,  # Presence Penalty (as in GPT-3)
+            token_ban=token_ban,  # ban the generation of some tokens
+            token_stop=token_stop
+        )
+
+        return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
+
+#    def generate_with_streaming(self, **kwargs):
+#        with Iteratorize(self.generate, kwargs, callback=None) as generator:
+#            reply = ''
+#            for token in generator:
+#                reply += token
+#                yield reply
+
+
+class RWKVTokenizer:
+    def __init__(self):
+        pass
+
+    @classmethod
+    def from_pretrained(self, path):
+        tokenizer_path = path / "20B_tokenizer.json"
+        tokenizer = Tokenizer.from_file(str(tokenizer_path))
+
+        result = self()
+        result.tokenizer = tokenizer
+        return result
+
+    def encode(self, prompt):
+        return self.tokenizer.encode(prompt).ids
+
+    def decode(self, ids):
+        return self.tokenizer.decode(ids)
+
diff --git a/runpod/runpod-worker-transformers/model_fetcher.py b/runpod/runpod-worker-transformers/model_fetcher.py
index 28f3086..b8244fb 100644
--- a/runpod/runpod-worker-transformers/model_fetcher.py
+++ b/runpod/runpod-worker-transformers/model_fetcher.py
@@ -76,7 +76,8 @@ def download_model(model_name):
 
     # --------------------------------- RWKV Raven 7B -------------------------------- #
     elif model_name == 'rwkv-4-raven-7b':
-        hf_hub_download(repo_id="BlinkDL/rwkv-4-raven", filename="RWKV-4-Raven-7B-v7-EngAndMore-20230404-ctx4096.pth")
+        snapshot_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-raven", filename="RWKV-4-Raven-7B-v8-Eng-20230408-ctx4096.pth")
+        hf_hub_download(repo_id="BlinkDL/Raven-RWKV-7B", filename="20B_tokenizer.json", local_dir=os.path.dirname(snapshot_path))  # the tokenizer must sit next to the .pth (see RWKV.py)
        #https://huggingface.co/yahma/RWKV-14b_quant/resolve/main/RWKV-4-Pile-14B-20230213-8019.pqth
 
     if snapshot_path:
diff --git a/runpod/runpod-worker-transformers/runpod_infer.py b/runpod/runpod-worker-transformers/runpod_infer.py
index 48f73cd..1ef780a 100644
--- a/runpod/runpod-worker-transformers/runpod_infer.py
+++ b/runpod/runpod-worker-transformers/runpod_infer.py
@@ -12,6 +12,9 @@ from transformers import (GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoXForCausalLM,
                           AutoConfig)
 # LlamaForCausalLM, LlamaTokenizer)
 
+from pathlib import Path
+import os, sys
+
 torch.cuda.is_available()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -103,8 +106,6 @@ INPUT_SCHEMA = {
 
 def load_quantized(model_name, wbits, groupsize, device):
     """https://github.com/oobabooga/text-generation-webui/blob/main/modules/GPTQ_loader.py"""
-    from pathlib import Path
-    import os, sys
     # os.system("mkdir repositories && git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa")
     sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
     from modelutils import find_layers
@@ -301,6 +302,17 @@ if __name__ == "__main__":
     # "chavinlo/gpt4-x-alpaca", local_files_only=True)
 
     elif args.model_name == 'rwkv-4-raven-7b':
-        pass
+        from RWKV import RWKVModel, RWKVTokenizer
+        path_to_model = next(Path('/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/"))
+        found_pths = list(path_to_model.glob("*.pth"))
+        pt_path = None
+        if len(found_pths) == 1:
+            pt_path = found_pths[0]
+        else:
+            print("Could not find the model, exiting...")
+            sys.exit(1)
+        model = RWKVModel.from_pretrained(pt_path, dtype="fp16", device="cuda")
+        tokenizer = RWKVTokenizer.from_pretrained(path_to_model)
+
 
     runpod.serverless.start({"handler": generator})
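
A minimal sketch of a call site using the new api_endpoint parameter of generate_sync, not part of the diff. The endpoint ID and API key are placeholders, bot=None stands in for the real bot instance, and the sketch assumes generate_sync returns the generated text:

import asyncio

from matrix_pygmalion_bot.ai.runpod_pygmalion import generate_sync

async def typing_fn():
    # Placeholder for the bot's Matrix "typing..." notification callback.
    pass

async def main():
    # "my-rwkv-endpoint" is a hypothetical RunPod serverless endpoint ID;
    # omitting api_endpoint falls back to the default "pygmalion-6b".
    reply = await generate_sync(
        prompt="User: Hello!\nBot:",
        api_key="YOUR_RUNPOD_API_KEY",  # placeholder
        bot=None,                       # the real bot instance in production
        typing_fn=typing_fn,
        api_endpoint="my-rwkv-endpoint",
    )
    print(reply)

asyncio.run(main())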