
rewrite remote transformers container

master
Hendrik Langer, 2 years ago
commit 698d59bd34
Changed files:
  1. runpod/runpod-worker-transformers/Dockerfile (11 lines changed)
  2. runpod/runpod-worker-transformers/model_fetcher.py (10 lines changed)
  3. runpod/runpod-worker-transformers/runpod_infer.py (84 lines changed)

runpod/runpod-worker-transformers/Dockerfile (11 lines changed)

@@ -1,6 +1,6 @@
#ARG BASE_IMAGE=nvidia/cuda:12.0.1-cudnn8-runtime-ubuntu22.04
ARG BASE_IMAGE=nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
#ARG BASE_IMAGE=nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
ARG BASE_IMAGE=runpod/pytorch:3.10-2.0.0-117
#ARG BASE_IMAGE=runpod/pytorch:3.10-2.0.0-117
#ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.03-py3
FROM ${BASE_IMAGE} as dev-base
@@ -48,17 +48,20 @@ RUN apt-get update --yes && \
apt-get clean && rm -rf /var/lib/apt/lists/*
RUN pip3 install --upgrade pip && \
# pip3 install torch torchvision torchaudio --extra-index-url=https://download.pytorch.org/whl/cu118 && \
pip3 install torch torchvision torchaudio --extra-index-url=https://download.pytorch.org/whl/cu118 && \
pip3 install bitsandbytes && \
pip3 install safetensors && \
pip3 install diffusers && \
pip3 install transformers accelerate xformers triton && \
pip3 install huggingface-hub && \
pip3 install runpod
pip3 install runpod && \
pip3 cache purge
RUN mkdir -p /workspace
WORKDIR /workspace
RUN mkdir repositories && git clone --branch cuda --single-branch --depth 1 https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa
#RUN mkdir repositories && git clone https://github.com/AlpinDale/gptq-gptj.git repositories/GPTQ-for-LLaMa && (cd repositories/GPTQ-for-LLaMa && python3 setup_cuda.py install)
COPY model_fetcher.py /workspace/
RUN python3 model_fetcher.py --model_name=${MODEL_NAME}
#RUN git lfs install && \
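Note: the image clones GPTQ-for-LLaMa into /workspace/repositories at build time, and runpod_infer.py later puts the relative path repositories/GPTQ-for-LLaMa on sys.path, so the handler only finds it when the working directory is /workspace. A minimal sketch of a more location-independent variant (an assumption about the image layout, not part of the commit):

import sys
from pathlib import Path

# Sketch: resolve the GPTQ-for-LLaMa checkout from the absolute path created
# by the Dockerfile above, instead of relying on the current working directory.
GPTQ_REPO = Path("/workspace/repositories/GPTQ-for-LLaMa")
if GPTQ_REPO.is_dir():
    sys.path.insert(0, str(GPTQ_REPO))
else:
    raise RuntimeError(f"GPTQ-for-LLaMa checkout not found at {GPTQ_REPO}")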

runpod/runpod-worker-transformers/model_fetcher.py (10 lines changed)

@@ -2,6 +2,7 @@
RunPod | Transformer | Model Fetcher
'''
import os
import argparse
import torch
@@ -30,17 +31,17 @@ def download_model(model_name):
    elif model_name == 'pygmalion-6b':
        # AutoModelForCausalLM.from_pretrained("PygmalionAI/pygmalion-6b", load_in_8bit=True)
        # AutoTokenizer.from_pretrained("PygmalionAI/pygmalion-6b")
        snapshot_download(repo_id="PygmalionAI/pygmalion-6b", revision="main")
        snapshot_path = snapshot_download(repo_id="PygmalionAI/pygmalion-6b", revision="main")
    # --------------------------------- Pygmalion -------------------------------- #
    elif model_name == 'pygmalion-6b-4bit-128g':
        snapshot_download(repo_id="mayaeary/pygmalion-6b-4bit-128g", revision="main")
        snapshot_path = snapshot_download(repo_id="mayaeary/pygmalion-6b-4bit-128g", revision="main")
    # --------------------------------- Pygmalion -------------------------------- #
    elif model_name == 'pygmalion-6b-gptq-4bit':
        # AutoModelForCausalLM.from_pretrained("OccamRazor/pygmalion-6b-gptq-4bit", from_pt=True)
        # AutoTokenizer.from_pretrained("OccamRazor/pygmalion-6b-gptq-4bit")
        snapshot_download(repo_id="OccamRazor/pygmalion-6b-gptq-4bit", revision="main")
        snapshot_path = snapshot_download(repo_id="OccamRazor/pygmalion-6b-gptq-4bit", revision="main")
@@ -78,6 +79,9 @@ def download_model(model_name):
        hf_hub_download(repo_id="BlinkDL/rwkv-4-raven", filename="RWKV-4-Raven-7B-v7-EngAndMore-20230404-ctx4096.pth")
        #https://huggingface.co/yahma/RWKV-14b_quant/resolve/main/RWKV-4-Pile-14B-20230213-8019.pqth

    if snapshot_path:
        os.system(f"ln -s \"{snapshot_path}\" /workdir/model")
# ---------------------------------------------------------------------------- #
#                                Parse Arguments                                #
# ---------------------------------------------------------------------------- #
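For the Pygmalion variants, model_fetcher.py now keeps the directory returned by snapshot_download() and symlinks it to /workdir/model, while runpod_infer.py instead locates the same snapshot by globbing the Hugging Face cache. A minimal sketch of both lookups, assuming huggingface_hub's default cache layout (models--{org}--{name}/snapshots/{revision}/):

from pathlib import Path
from huggingface_hub import snapshot_download

# Download (or reuse from cache) the repo and get the local snapshot directory;
# repo id taken from the diff above.
snapshot_path = Path(snapshot_download(repo_id="mayaeary/pygmalion-6b-4bit-128g",
                                       revision="main"))

# Equivalent cache lookup used by runpod_infer.py when only the baked-in cache exists.
cached_snapshot = next(Path("/root/.cache/huggingface/hub/").glob("models--*/snapshots/*/"))

print(snapshot_path, cached_snapshot)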

runpod/runpod-worker-transformers/runpod_infer.py (84 lines changed)

@@ -3,11 +3,13 @@ RunPod | Transformer | Handler
'''
import argparse
import accelerate
import torch
import runpod
from runpod.serverless.utils.rp_validator import validate
from transformers import (GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoXForCausalLM,
GPTNeoXTokenizerFast, GPTJForCausalLM, AutoTokenizer, AutoModelForCausalLM,)
GPTNeoXTokenizerFast, GPTJForCausalLM, AutoTokenizer, AutoModelForCausalLM,
AutoConfig)
# LlamaForCausalLM, LlamaTokenizer)
@@ -99,6 +101,81 @@ INPUT_SCHEMA = {
}
def load_quantized(model_name, wbits, groupsize):
    """https://github.com/oobabooga/text-generation-webui/blob/main/modules/GPTQ_loader.py"""
    from pathlib import Path
    import os, sys
    # os.system("mkdir repositories && git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa")
    sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
    from modelutils import find_layers
    from quant import make_quant
    import transformers
    from transformers import AutoConfig

    # Try to determine model type from model name
    name = model_name.lower()
    if any((k in name for k in ['llama', 'alpaca', 'vicuna'])):
        model_type = 'llama'
    elif any((k in name for k in ['opt-', 'galactica'])):
        model_type = 'opt'
    elif any((k in name for k in ['gpt-j', 'pygmalion-6b'])):
        model_type = 'gptj'
    else:
print("Can't determine model type from model name."
"argument")
exit()
    # Now we are going to try to locate the quantized model file.
    #path_to_model = Path(f'/workdir/model')
    path_to_model = next( Path(f'/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/") )
    found_pts = list(path_to_model.glob("*.pt"))
    found_safetensors = list(path_to_model.glob("*.safetensors"))
    pt_path = None

    if len(found_pts) == 1:
        pt_path = found_pts[0]
    elif len(found_safetensors) == 1:
        pt_path = found_safetensors[0]
    else:
        pass

    if not pt_path:
        print("Could not find the quantized model in .pt or .safetensors format, exiting...")
        exit()

    config = AutoConfig.from_pretrained(str(path_to_model))

    def noop(*args, **kwargs):
        pass
    torch.nn.init.kaiming_uniform_ = noop
    torch.nn.init.uniform_ = noop
    torch.nn.init.normal_ = noop

    torch.set_default_dtype(torch.half)
    transformers.modeling_utils._init_weights = False
    torch.set_default_dtype(torch.half)
    model = AutoModelForCausalLM.from_config(config)
    torch.set_default_dtype(torch.float)
    model = model.eval()
    layers = find_layers(model)
    for name in ['lm_head']:
        if name in layers:
            del layers[name]
    make_quant(model, layers, wbits, groupsize)
    del layers

    print('Loading model ...')
    if str(pt_path).endswith('.safetensors'):
        from safetensors.torch import load_file as safe_load
        model.load_state_dict(safe_load(str(pt_path)))
    else:
        model.load_state_dict(torch.load(str(pt_path)))
    model.seqlen = 2048
    print('Done.')

    return model

def generator(job):
    '''
    Run the job input to generate text output.
@@ -169,8 +246,9 @@ if __name__ == "__main__":
            "PygmalionAI/pygmalion-6b", local_files_only=True)
    elif args.model_name == 'pygmalion-6b-4bit-128g':
        model = AutoModelForCausalLM.from_pretrained(
            "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True).to(device)
        # model = AutoModelForCausalLM.from_pretrained(
        #     "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True).to(device)
        model = load_quantized("pygmalion-6b-4bit-128g", 4, 128).to(device)
        tokenizer = AutoTokenizer.from_pretrained(
            "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True)
