prepare RWKV

Branch: master
Hendrik Langer, 2 years ago
commit 316f50fdb5
5 changed files:

  1. matrix_pygmalion_bot/ai/runpod_pygmalion.py (7 lines changed)
  2. runpod/runpod-worker-transformers/Dockerfile (9 lines changed)
  3. runpod/runpod-worker-transformers/RWKV.py (74 lines changed)
  4. runpod/runpod-worker-transformers/model_fetcher.py (3 lines changed)
  5. runpod/runpod-worker-transformers/runpod_infer.py (18 lines changed)

matrix_pygmalion_bot/ai/runpod_pygmalion.py (7 lines changed)

@@ -19,10 +19,11 @@ async def generate_sync(
     prompt: str,
     api_key: str,
     bot,
-    typing_fn
+    typing_fn,
+    api_endpoint = "pygmalion-6b"
 ):
     # Set the API endpoint URL
-    endpoint = "https://api.runpod.ai/v2/pygmalion-6b/run"
+    endpoint = f"https://api.runpod.ai/v2/{api_endpoint}/run"
     # Set the headers for the request
     headers = {
@@ -56,7 +57,7 @@ async def generate_sync(
     TIMEOUT = 360
     DELAY = 5
     for i in range(TIMEOUT//DELAY):
-        endpoint = "https://api.runpod.ai/v2/pygmalion-6b/status/" + job_id
+        endpoint = f"https://api.runpod.ai/v2/{api_endpoint}/status/" + job_id
         r = requests.get(endpoint, headers=headers)
         r_json = r.json()
         logger.info(r_json)
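
Note: generate_sync() now takes the serverless endpoint ID as a parameter instead of
hard-coding pygmalion-6b into the URL (the status URL needed the same f-string prefix
as the run URL; fixed above). A minimal caller sketch; the endpoint ID
"rwkv-4-raven-7b" and the surrounding variables are illustrative assumptions, not
part of this commit:

    import os

    # Hypothetical call site (inside an async function); the endpoint ID is an example.
    reply = await generate_sync(
        prompt,
        api_key=os.environ["RUNPOD_API_KEY"],   # placeholder for the bot's key
        bot=bot,
        typing_fn=typing_fn,
        api_endpoint="rwkv-4-raven-7b",         # omit to keep the "pygmalion-6b" default
    )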

runpod/runpod-worker-transformers/Dockerfile (9 lines changed)

@@ -55,7 +55,8 @@ RUN pip3 install --upgrade pip && \
     pip3 install safetensors && \
     pip3 install sentencepiece && \
     pip3 install diffusers && \
-    pip3 install git+https://github.com/huggingface/transformers accelerate xformers triton && \
+    pip3 install accelerate xformers triton && \
+    pip3 install git+https://github.com/huggingface/transformers.git && \
     pip3 install huggingface-hub && \
     pip3 install runpod && \
     pip3 cache purge
@@ -88,8 +89,9 @@ RUN pip3 install --upgrade pip && \
     pip3 install safetensors && \
     pip3 install sentencepiece && \
     pip3 install diffusers && \
-    pip3 install git+https://github.com/huggingface/transformers accelerate xformers triton && \
-#    pip3 install rwkv && \
+    pip3 install accelerate xformers triton && \
+    pip3 install git+https://github.com/huggingface/transformers.git && \
+    pip3 install rwkv && \
     pip3 install huggingface-hub && \
     pip3 install runpod && \
     pip3 cache purge
@@ -103,6 +105,7 @@ COPY --from=builder /root/.cache/huggingface /root/.cache/huggingface
 COPY model_fetcher.py /workspace/
 COPY runpod_infer.py /workspace/
 COPY RWKV.py /workspace/
+COPY test_input.json /workspace/
 CMD python3 -u runpod_infer.py --model_name=${MODEL_NAME}
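
Note: the worker image now installs the rwkv package for real (it was previously a
commented-out line) and splits the transformers git install into its own step. A
quick sanity check one might run inside the built image; a sketch, assuming the
container is started with a GPU attached:

    # Sketch: confirm the layers this Dockerfile builds are importable.
    import importlib
    for pkg in ("rwkv", "transformers", "accelerate", "runpod"):
        importlib.import_module(pkg)   # ImportError here means a broken layer
    import torch
    assert torch.cuda.is_available(), "RWKV_CUDA_ON=1 expects a visible GPU"
    print("image sanity check passed")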

runpod/runpod-worker-transformers/RWKV.py (new file, 74 lines)

@@ -0,0 +1,74 @@
+# https://github.com/oobabooga/text-generation-webui/blob/main/modules/RWKV.py
+import os
+from pathlib import Path
+
+import numpy as np
+from tokenizers import Tokenizer
+
+#from modules.callbacks import Iteratorize
+
+np.set_printoptions(precision=4, suppress=True, linewidth=200)
+
+os.environ['RWKV_JIT_ON'] = '1'
+os.environ["RWKV_CUDA_ON"] = '1'
+
+from rwkv.model import RWKV
+from rwkv.utils import PIPELINE, PIPELINE_ARGS
+
+
+class RWKVModel:
+    def __init__(self):
+        pass
+
+    @classmethod
+    def from_pretrained(cls, path, dtype="fp16", device="cuda"):
+        tokenizer_path = Path(f"{path.parent}/20B_tokenizer.json")
+
+        model = RWKV(model=str(path), strategy=f'{device} {dtype}')
+        #model = RWKV(model=str(path), strategy='cuda fp16i8 *8 -> cuda fp16')
+        pipeline = PIPELINE(model, str(tokenizer_path))
+
+        result = cls()
+        result.pipeline = pipeline
+        return result
+
+    def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=None, alpha_frequency=0.1, alpha_presence=0.1, token_ban=[0], token_stop=[], callback=None):
+        args = PIPELINE_ARGS(
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            alpha_frequency=alpha_frequency,  # Frequency Penalty (as in GPT-3)
+            alpha_presence=alpha_presence,  # Presence Penalty (as in GPT-3)
+            token_ban=token_ban,  # ban the generation of some tokens
+            token_stop=token_stop
+        )
+
+        return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
+
+#    def generate_with_streaming(self, **kwargs):
+#        with Iteratorize(self.generate, kwargs, callback=None) as generator:
+#            reply = ''
+#            for token in generator:
+#                reply += token
+#                yield reply
+
+
+class RWKVTokenizer:
+    def __init__(self):
+        pass
+
+    @classmethod
+    def from_pretrained(cls, path):
+        tokenizer_path = path / "20B_tokenizer.json"
+        tokenizer = Tokenizer.from_file(str(tokenizer_path))
+
+        result = cls()
+        result.tokenizer = tokenizer
+        return result
+
+    def encode(self, prompt):
+        return self.tokenizer.encode(prompt).ids
+
+    def decode(self, ids):
+        return self.tokenizer.decode(ids)
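
Note: these wrappers mirror the oobabooga module they are copied from; from_pretrained
expects 20B_tokenizer.json to sit next to the .pth checkpoint. A minimal usage sketch,
with placeholder paths:

    from pathlib import Path
    from RWKV import RWKVModel, RWKVTokenizer

    ckpt = Path("/models/RWKV-4-Raven-7B.pth")    # placeholder path
    model = RWKVModel.from_pretrained(ckpt, dtype="fp16", device="cuda")
    tokenizer = RWKVTokenizer.from_pretrained(ckpt.parent)

    text = model.generate(context="The quick brown fox", token_count=40)
    ids = tokenizer.encode(text)                  # round-trip through the tokenizer
    print(tokenizer.decode(ids))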

runpod/runpod-worker-transformers/model_fetcher.py (3 lines changed)

@@ -76,7 +76,8 @@ def download_model(model_name):
     # --------------------------------- RWKV Raven 7B -------------------------------- #
     elif model_name == 'rwkv-4-raven-7b':
-        hf_hub_download(repo_id="BlinkDL/rwkv-4-raven", filename="RWKV-4-Raven-7B-v7-EngAndMore-20230404-ctx4096.pth")
+        snapshot_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-raven", filename="RWKV-4-Raven-7B-v8-Eng-20230408-ctx4096.pth")
+        hf_hub_download(repo_id="BlinkDL/Raven-RWKV-7B", filename="20B_tokenizer.json", local_dir=snapshot_path)
         #https://huggingface.co/yahma/RWKV-14b_quant/resolve/main/RWKV-4-Pile-14B-20230213-8019.pqth

     if snapshot_path:
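
Note: hf_hub_download() returns the path of the downloaded file itself, not a
directory, so passing snapshot_path straight to local_dir is fragile; the tokenizer
needs to land next to the .pth for RWKV.py's path.parent lookup to find it. A sketch
of a more defensive variant, reusing the repos and filenames from the commit:

    from pathlib import Path
    from huggingface_hub import hf_hub_download

    snapshot_path = hf_hub_download(
        repo_id="BlinkDL/rwkv-4-raven",
        filename="RWKV-4-Raven-7B-v8-Eng-20230408-ctx4096.pth",
    )
    hf_hub_download(
        repo_id="BlinkDL/Raven-RWKV-7B",
        filename="20B_tokenizer.json",
        local_dir=Path(snapshot_path).parent,  # the directory containing the .pth
    )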

runpod/runpod-worker-transformers/runpod_infer.py (18 lines changed)

@@ -12,6 +12,9 @@ from transformers import (GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoXForCausalLM,
                           AutoConfig)
 # LlamaForCausalLM, LlamaTokenizer)
+from pathlib import Path
+import os, sys
+
 torch.cuda.is_available()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -103,8 +106,6 @@ INPUT_SCHEMA = {
 def load_quantized(model_name, wbits, groupsize, device):
     """https://github.com/oobabooga/text-generation-webui/blob/main/modules/GPTQ_loader.py"""
-    from pathlib import Path
-    import os, sys
     # os.system("mkdir repositories && git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa")
     sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
     from modelutils import find_layers
@@ -301,6 +302,17 @@ if __name__ == "__main__":
     #     "chavinlo/gpt4-x-alpaca", local_files_only=True)
     elif args.model_name == 'rwkv-4-raven-7b':
-        pass
+        from RWKV import RWKVModel, RWKVTokenizer
+        path_to_model = next(Path('/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/"))
+        found_pths = list(path_to_model.glob("*.pth"))
+        pt_path = None
+        if len(found_pths) == 1:
+            pt_path = found_pths[0]
+        else:
+            print("Could not find the model, exiting...")
+            exit()
+        model = RWKVModel.from_pretrained(pt_path, dtype="fp16", device="cuda")
+        tokenizer = RWKVTokenizer.from_pretrained(path_to_model)
+
     runpod.serverless.start({"handler": generator})
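
Note: this binds model and tokenizer for the RWKV path before the serverless loop
starts (the original indexed found_pts, a typo for found_pths; fixed above). The
generator handler itself is outside this diff; a sketch of the shape RunPod expects,
with illustrative field names that are not taken from this file:

    # Hypothetical handler for runpod.serverless.start({"handler": generator}).
    def generator(job):
        params = job["input"]                      # validated against INPUT_SCHEMA
        text = model.generate(
            context=params["prompt"],
            token_count=params.get("max_length", 100),
            temperature=params.get("temperature", 1.0),
        )
        return {"text": text}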
