prepare RWKV

2 years ago · 316f50fdb5
5 changed files with 101 additions and 10 deletions
--- a/matrix_pygmalion_bot/ai/runpod_pygmalion.py
+++ b/matrix_pygmalion_bot/ai/runpod_pygmalion.py
@ -19,10 +19,11 @@ async def generate_sync(
    prompt: str,
    api_key: str,
    bot,
-    typing_fn
+    typing_fn,
+    api_endpoint = "pygmalion-6b"
 ):
    # Set the API endpoint URL
-    endpoint = "https://api.runpod.ai/v2/pygmalion-6b/run"
+    endpoint = f"https://api.runpod.ai/v2/{api_endpoint}/run"

    # Set the headers for the request
    headers = {
@ -56,7 +57,7 @@ async def generate_sync(
        TIMEOUT = 360
        DELAY = 5
        for i in range(TIMEOUT//DELAY):
-            endpoint = "https://api.runpod.ai/v2/pygmalion-6b/status/" + job_id
+            endpoint = f"https://api.runpod.ai/v2/{api_endpoint}/status/{job_id}"  # poll the same endpoint the job was submitted to
            r = requests.get(endpoint, headers=headers)
            r_json = r.json()
            logger.info(r_json)
--- a/runpod/runpod-worker-transformers/Dockerfile
+++ b/runpod/runpod-worker-transformers/Dockerfile
@ -55,7 +55,8 @@ RUN pip3 install --upgrade pip && \
    pip3 install safetensors && \
    pip3 install sentencepiece && \
    pip3 install diffusers && \
-    pip3 install git+https://github.com/huggingface/transformers accelerate xformers triton && \
+    pip3 install accelerate xformers triton && \
+    pip3 install git+https://github.com/huggingface/transformers.git && \
    pip3 install huggingface-hub && \
    pip3 install runpod && \
    pip3 cache purge
@ -88,8 +89,9 @@ RUN pip3 install --upgrade pip && \
    pip3 install safetensors && \
    pip3 install sentencepiece && \
    pip3 install diffusers && \
-    pip3 install git+https://github.com/huggingface/transformers accelerate xformers triton && \
-#    pip3 install rwkv && \
+    pip3 install accelerate xformers triton && \
+    pip3 install git+https://github.com/huggingface/transformers.git && \
+    pip3 install rwkv && \
    pip3 install huggingface-hub && \
    pip3 install runpod && \
    pip3 cache purge
@ -103,6 +105,7 @@ COPY --from=builder /root/.cache/huggingface /root/.cache/huggingface

 COPY model_fetcher.py /workspace/
 COPY runpod_infer.py /workspace/
+COPY RWKV.py /workspace/
 COPY test_input.json /workspace/

 CMD python3 -u runpod_infer.py --model_name=${MODEL_NAME}
--- a/runpod/runpod-worker-transformers/RWKV.py
+++ b/runpod/runpod-worker-transformers/RWKV.py
@ -0,0 +1,74 @@
+# https://github.com/oobabooga/text-generation-webui/blob/main/modules/RWKV.py
+import os
+from pathlib import Path
+
+import numpy as np
+from tokenizers import Tokenizer
+
+#from modules.callbacks import Iteratorize
+
+np.set_printoptions(precision=4, suppress=True, linewidth=200)
+
+# These flags are set before the rwkv imports below; presumably the rwkv package
+# reads them at import time, so the ordering matters -- TODO confirm against the rwkv docs.
+os.environ['RWKV_JIT_ON'] = '1'
+os.environ["RWKV_CUDA_ON"] = '1'
+
+from rwkv.model import RWKV
+from rwkv.utils import PIPELINE, PIPELINE_ARGS
+
+
+class RWKVModel:
+    """Thin wrapper that owns an rwkv ``PIPELINE`` and exposes ``generate``."""
+
+    def __init__(self):
+        pass
+
+    @classmethod
+    def from_pretrained(cls, path, dtype="fp16", device="cuda"):
+        """Load the checkpoint at ``path`` and build a generation pipeline.
+
+        ``path`` may be a ``str`` or a ``Path`` pointing at the checkpoint file.
+        ``20B_tokenizer.json`` is looked up in the same directory as the
+        checkpoint. ``device`` and ``dtype`` are joined with a space to form
+        the rwkv strategy string (``"cuda fp16"`` with the defaults).
+
+        Returns a new instance whose ``pipeline`` attribute is set.
+        """
+        path = Path(path)  # accept plain strings as well as Path objects
+        tokenizer_path = path.parent / "20B_tokenizer.json"
+
+        model = RWKV(model=str(path), strategy=f'{device} {dtype}')
+        #model = RWKV(model=str(path), strategy='cuda fp16i8 *8 -> cuda fp16')
+
+        result = cls()
+        result.pipeline = PIPELINE(model, str(tokenizer_path))
+        return result
+
+    def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=None, alpha_frequency=0.1, alpha_presence=0.1, token_ban=None, token_stop=None, callback=None):
+        """Generate a continuation of ``context`` through the wrapped pipeline.
+
+        The sampling parameters are packed into ``PIPELINE_ARGS`` and passed to
+        ``self.pipeline.generate`` together with ``token_count`` and
+        ``callback``; its return value is returned unchanged.
+
+        ``token_ban`` defaults to ``[0]`` and ``token_stop`` to ``[]``. Both use
+        ``None`` as the signature default so that no list object is shared
+        between calls. ``repetition_penalty`` is accepted but not used here.
+        """
+        args = PIPELINE_ARGS(
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            alpha_frequency=alpha_frequency,  # Frequency Penalty (as in GPT-3)
+            alpha_presence=alpha_presence,  # Presence Penalty (as in GPT-3)
+            token_ban=[0] if token_ban is None else token_ban,  # ban the generation of some tokens
+            token_stop=[] if token_stop is None else token_stop
+        )
+
+        return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
+
+#    def generate_with_streaming(self, **kwargs):
+#        with Iteratorize(self.generate, kwargs, callback=None) as generator:
+#            reply = ''
+#            for token in generator:
+#                reply += token
+#                yield reply
+
+
+class RWKVTokenizer:
+    """Wrapper around a ``tokenizers.Tokenizer`` loaded from ``20B_tokenizer.json``."""
+
+    def __init__(self):
+        pass
+
+    @classmethod
+    def from_pretrained(cls, path):
+        """Load ``20B_tokenizer.json`` from the directory ``path``.
+
+        ``path`` may be a ``str`` or a ``Path``. Returns a new instance whose
+        ``tokenizer`` attribute is set.
+        """
+        tokenizer_path = Path(path) / "20B_tokenizer.json"
+
+        result = cls()
+        result.tokenizer = Tokenizer.from_file(str(tokenizer_path))
+        return result
+
+    def encode(self, prompt):
+        """Return the token ids the wrapped tokenizer produces for ``prompt``."""
+        return self.tokenizer.encode(prompt).ids
+
+    def decode(self, ids):
+        """Return the text the wrapped tokenizer decodes from ``ids``."""
+        return self.tokenizer.decode(ids)
+
--- a/runpod/runpod-worker-transformers/model_fetcher.py
+++ b/runpod/runpod-worker-transformers/model_fetcher.py
@ -76,7 +76,8 @@ def download_model(model_name):

    # --------------------------------- RWKV Raven 7B -------------------------------- #
    elif model_name == 'rwkv-4-raven-7b':
-        hf_hub_download(repo_id="BlinkDL/rwkv-4-raven", filename="RWKV-4-Raven-7B-v7-EngAndMore-20230404-ctx4096.pth")
+        snapshot_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-raven", filename="RWKV-4-Raven-7B-v8-Eng-20230408-ctx4096.pth")
+        # hf_hub_download returns the path of the downloaded *file*, so the tokenizer has to
+        # go into that file's parent directory; RWKV.py looks for 20B_tokenizer.json beside the checkpoint.
+        import os  # local import: this file's import header is not visible in this hunk
+        hf_hub_download(repo_id="BlinkDL/Raven-RWKV-7B", filename="20B_tokenizer.json", local_dir=os.path.dirname(snapshot_path))
+        # NOTE(review): BlinkDL/Raven-RWKV-7B looks like a Space rather than a model repo; if so this
+        # call needs repo_type="space" -- confirm on the Hub.
        #https://huggingface.co/yahma/RWKV-14b_quant/resolve/main/RWKV-4-Pile-14B-20230213-8019.pqth

    if snapshot_path:
--- a/runpod/runpod-worker-transformers/runpod_infer.py
+++ b/runpod/runpod-worker-transformers/runpod_infer.py
@ -12,6 +12,9 @@ from transformers import (GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoXForCausalLM,
                          AutoConfig)
 #                          LlamaForCausalLM, LlamaTokenizer)

+from pathlib import Path
+import os, sys
+

 torch.cuda.is_available()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@ -103,8 +106,6 @@ INPUT_SCHEMA = {

 def load_quantized(model_name, wbits, groupsize, device):
    """https://github.com/oobabooga/text-generation-webui/blob/main/modules/GPTQ_loader.py"""
-    from pathlib import Path
-    import os, sys
 #    os.system("mkdir repositories && git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa")
    sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
    from modelutils import find_layers
@ -301,6 +302,17 @@ if __name__ == "__main__":
 #            "chavinlo/gpt4-x-alpaca", local_files_only=True)

    elif args.model_name == 'rwkv-4-raven-7b':
-        pass
+        from RWKV import RWKVModel, RWKVTokenizer
+        # Look for the checkpoint in every cached snapshot: the tokenizer download may add
+        # a second models--* directory, so taking the first snapshot found is not reliable.
+        found_pths = list(Path('/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/*.pth"))
+        if len(found_pths) != 1:
+            print("Could not find the model, exiting...")
+            sys.exit(1)  # non-zero status so the failed start-up is visible
+        pt_path = found_pths[0]
+        path_to_model = pt_path.parent  # snapshot directory; expected to hold 20B_tokenizer.json too
+        model = RWKVModel.from_pretrained(pt_path, dtype="fp16", device="cuda")
+        tokenizer = RWKVTokenizer.from_pretrained(path_to_model)
+

    runpod.serverless.start({"handler": generator})