prepare RWKV

2 years ago · 316f50fdb5
5 changed files with 101 additions and 10 deletions
--- a/matrix_pygmalion_bot/ai/runpod_pygmalion.py
+++ b/matrix_pygmalion_bot/ai/runpod_pygmalion.py
@ -19,10 +19,11 @@ async def generate_sync(
    prompt: str,
    api_key: str,
    bot,
-    typing_fn
+    typing_fn,
    api_endpoint = "pygmalion-6b"
 ):
    # Set the API endpoint URL
-    endpoint = "https://api.runpod.ai/v2/pygmalion-6b/run"
+    endpoint = f"https://api.runpod.ai/v2/{api_endpoint}/run"
    # Set the headers for the request
    headers = {
@ -56,7 +57,7 @@ async def generate_sync(
        TIMEOUT = 360
        DELAY = 5
        for i in range(TIMEOUT//DELAY):
-            endpoint = "https://api.runpod.ai/v2/pygmalion-6b/status/" + job_id
+            endpoint = f"https://api.runpod.ai/v2/{api_endpoint}/status/" + job_id  # f-string: interpolate the configured endpoint name
            r = requests.get(endpoint, headers=headers)
            r_json = r.json()
            logger.info(r_json)
--- a/runpod/runpod-worker-transformers/Dockerfile
+++ b/runpod/runpod-worker-transformers/Dockerfile
@ -55,7 +55,8 @@ RUN pip3 install --upgrade pip && \
    pip3 install safetensors && \
    pip3 install sentencepiece && \
    pip3 install diffusers && \
-    pip3 install git+https://github.com/huggingface/transformers accelerate xformers triton && \
+    pip3 install accelerate xformers triton && \
    pip3 install git+https://github.com/huggingface/transformers.git && \
    pip3 install huggingface-hub && \
    pip3 install runpod && \
    pip3 cache purge
@ -88,8 +89,9 @@ RUN pip3 install --upgrade pip && \
    pip3 install safetensors && \
    pip3 install sentencepiece && \
    pip3 install diffusers && \
-    pip3 install git+https://github.com/huggingface/transformers accelerate xformers triton && \
+    pip3 install accelerate xformers triton && \
-#    pip3 install rwkv && \
+    pip3 install git+https://github.com/huggingface/transformers.git && \
    pip3 install rwkv && \
    pip3 install huggingface-hub && \
    pip3 install runpod && \
    pip3 cache purge
@ -103,6 +105,7 @@ COPY --from=builder /root/.cache/huggingface /root/.cache/huggingface
 COPY model_fetcher.py /workspace/
 COPY runpod_infer.py /workspace/
 COPY RWKV.py /workspace/
 COPY test_input.json /workspace/
 CMD python3 -u runpod_infer.py --model_name=${MODEL_NAME}
--- a/runpod/runpod-worker-transformers/RWKV.py
+++ b/runpod/runpod-worker-transformers/RWKV.py
@ -0,0 +1,74 @@
 # https://github.com/oobabooga/text-generation-webui/blob/main/modules/RWKV.py
 import os
 from pathlib import Path
 import numpy as np
 from tokenizers import Tokenizer
 #from modules.callbacks import Iteratorize
 # Compact numpy printing: 4 decimals, no scientific notation for small values, wide lines.
 np.set_printoptions(precision=4, suppress=True, linewidth=200)
 # NOTE(review): these flags are set before the rwkv imports below; presumably
 # rwkv reads them at import time, so keep this ordering — TODO confirm.
 os.environ['RWKV_JIT_ON'] = '1'
 os.environ["RWKV_CUDA_ON"] = '1'
 from rwkv.model import RWKV
 from rwkv.utils import PIPELINE, PIPELINE_ARGS
 class RWKVModel:
    """Thin wrapper that owns an rwkv ``PIPELINE`` and exposes ``generate``.

    Instances are meant to be built with :meth:`from_pretrained`, which
    attaches the ``pipeline`` attribute; ``__init__`` itself sets nothing.
    """

    def __init__(self):
        pass

    @classmethod
    def from_pretrained(cls, path, dtype="fp16", device="cuda"):
        """Load an RWKV checkpoint and build a generation pipeline for it.

        Args:
            path: Location of the model checkpoint file (``str`` or ``Path``).
                The tokenizer file ``20B_tokenizer.json`` is expected in the
                same directory as the checkpoint.
            dtype: Precision part of the rwkv strategy string.
            device: Device part of the rwkv strategy string.

        Returns:
            A new instance whose ``pipeline`` attribute is the loaded PIPELINE.
        """
        # Accept plain strings as well as Path objects.
        path = Path(path)
        tokenizer_path = path.parent / "20B_tokenizer.json"
        # The strategy string is "<device> <dtype>", e.g. "cuda fp16".
        # Alternative strategy string kept for reference:
        #   'cuda fp16i8 *8 -> cuda fp16'
        model = RWKV(model=str(path), strategy=f'{device} {dtype}')
        result = cls()
        result.pipeline = PIPELINE(model, str(tokenizer_path))
        return result

    def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=None, alpha_frequency=0.1, alpha_presence=0.1, token_ban=None, token_stop=None, callback=None):
        """Generate a continuation of ``context`` through the wrapped pipeline.

        Args:
            context: Prompt text handed to the pipeline.
            token_count: Forwarded to the pipeline as ``token_count``.
            temperature, top_p, top_k: Sampling settings forwarded via
                ``PIPELINE_ARGS``.
            repetition_penalty: Accepted for interface compatibility; it is
                not forwarded to the pipeline.
            alpha_frequency: Frequency penalty (as in GPT-3).
            alpha_presence: Presence penalty (as in GPT-3).
            token_ban: Token ids that must not be generated. Defaults to
                ``[0]`` when omitted.
            token_stop: Token ids passed as ``token_stop``. Defaults to an
                empty list when omitted.
            callback: Forwarded unchanged to ``pipeline.generate``.

        Returns:
            Whatever ``self.pipeline.generate`` returns.
        """
        # Build fresh lists per call so defaults are never shared between calls.
        if token_ban is None:
            token_ban = [0]
        if token_stop is None:
            token_stop = []
        args = PIPELINE_ARGS(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            alpha_frequency=alpha_frequency,  # Frequency Penalty (as in GPT-3)
            alpha_presence=alpha_presence,  # Presence Penalty (as in GPT-3)
            token_ban=token_ban,  # ban the generation of some tokens
            token_stop=token_stop,
        )
        return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
 #    def generate_with_streaming(self, **kwargs):
 #        with Iteratorize(self.generate, kwargs, callback=None) as generator:
 #            reply = ''
 #            for token in generator:
 #                reply += token
 #                yield reply
 class RWKVTokenizer:
    """Wrapper around a ``tokenizers.Tokenizer`` loaded from ``20B_tokenizer.json``.

    Instances are meant to be built with :meth:`from_pretrained`, which
    attaches the ``tokenizer`` attribute; ``__init__`` itself sets nothing.
    """

    def __init__(self):
        pass

    @classmethod
    def from_pretrained(cls, path):
        """Load the tokenizer stored in directory ``path``.

        Args:
            path: Directory (``str`` or ``Path``) containing
                ``20B_tokenizer.json``.

        Returns:
            A new instance whose ``tokenizer`` attribute is the loaded Tokenizer.
        """
        # Coerce so that plain string directories work as well as Path objects.
        tokenizer_path = Path(path) / "20B_tokenizer.json"
        result = cls()
        result.tokenizer = Tokenizer.from_file(str(tokenizer_path))
        return result

    def encode(self, prompt):
        """Return the ids of the encoding the tokenizer produces for ``prompt``."""
        return self.tokenizer.encode(prompt).ids

    def decode(self, ids):
        """Return the result of the tokenizer decoding ``ids``."""
        return self.tokenizer.decode(ids)
--- a/runpod/runpod-worker-transformers/model_fetcher.py
+++ b/runpod/runpod-worker-transformers/model_fetcher.py
@ -76,7 +76,8 @@ def download_model(model_name):
    # --------------------------------- RWKV Raven 7B -------------------------------- #
    elif model_name == 'rwkv-4-raven-7b':
-        hf_hub_download(repo_id="BlinkDL/rwkv-4-raven", filename="RWKV-4-Raven-7B-v7-EngAndMore-20230404-ctx4096.pth")
+        snapshot_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-raven", filename="RWKV-4-Raven-7B-v8-Eng-20230408-ctx4096.pth")
        hf_hub_download(repo_id="BlinkDL/Raven-RWKV-7B", filename="20B_tokenizer.json", local_dir=snapshot_path)
        #https://huggingface.co/yahma/RWKV-14b_quant/resolve/main/RWKV-4-Pile-14B-20230213-8019.pqth
    if snapshot_path:
--- a/runpod/runpod-worker-transformers/runpod_infer.py
+++ b/runpod/runpod-worker-transformers/runpod_infer.py
@ -12,6 +12,9 @@ from transformers import (GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoXForCausalLM,
                          AutoConfig)
 #                          LlamaForCausalLM, LlamaTokenizer)
 from pathlib import Path
 import os, sys
 torch.cuda.is_available()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@ -103,8 +106,6 @@ INPUT_SCHEMA = {
 def load_quantized(model_name, wbits, groupsize, device):
    """https://github.com/oobabooga/text-generation-webui/blob/main/modules/GPTQ_loader.py"""
    from pathlib import Path
    import os, sys
 #    os.system("mkdir repositories && git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa")
    sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
    from modelutils import find_layers
@ -301,6 +302,17 @@ if __name__ == "__main__":
 #            "chavinlo/gpt4-x-alpaca", local_files_only=True)
    elif args.model_name == 'rwkv-4-raven-7b':
-        pass
+        from RWKV import RWKVModel, RWKVTokenizer
        # Take the first snapshot directory found in the Hugging Face hub cache.
        path_to_model = next( Path(f'/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/") )
        # Exactly one .pth checkpoint is expected in that snapshot directory;
        # zero or several matches are treated as "model not found".
        found_pths = list(path_to_model.glob("*.pth"))
        pt_path = None
        if len(found_pths) == 1:
            # Fixed: was `found_pts[0]`, an undefined name that raised NameError.
            pt_path = found_pths[0]
        else:
            print("Could not find the model, exiting...")
            exit()
        model = RWKVModel.from_pretrained(Path(str(pt_path)), dtype="fp16", device="cuda")
        # The tokenizer is loaded from the same snapshot directory as the checkpoint.
        tokenizer = RWKVTokenizer.from_pretrained(Path(str(path_to_model)))
    runpod.serverless.start({"handler": generator})