From 698d59bd34139bc0521cf87877692d187904d47c Mon Sep 17 00:00:00 2001
From: Hendrik Langer
Date: Thu, 6 Apr 2023 18:37:08 +0200
Subject: [PATCH] rewrite remote transformers container

---
 runpod/runpod-worker-transformers/Dockerfile     | 11 ++-
 .../model_fetcher.py                             | 10 ++-
 .../runpod_infer.py                              | 84 ++++++++++++++++++-
 3 files changed, 95 insertions(+), 10 deletions(-)

diff --git a/runpod/runpod-worker-transformers/Dockerfile b/runpod/runpod-worker-transformers/Dockerfile
index 00b6556..691d2e4 100644
--- a/runpod/runpod-worker-transformers/Dockerfile
+++ b/runpod/runpod-worker-transformers/Dockerfile
@@ -1,6 +1,6 @@
-#ARG BASE_IMAGE=nvidia/cuda:12.0.1-cudnn8-runtime-ubuntu22.04
+ARG BASE_IMAGE=nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
 #ARG BASE_IMAGE=nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
-ARG BASE_IMAGE=runpod/pytorch:3.10-2.0.0-117
+#ARG BASE_IMAGE=runpod/pytorch:3.10-2.0.0-117
 #ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.03-py3
 
 FROM ${BASE_IMAGE} as dev-base
@@ -48,17 +48,20 @@ RUN apt-get update --yes && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 
 RUN pip3 install --upgrade pip && \
-#    pip3 install torch torchvision torchaudio --extra-index-url=https://download.pytorch.org/whl/cu118 && \
+    pip3 install torch torchvision torchaudio --extra-index-url=https://download.pytorch.org/whl/cu118 && \
     pip3 install bitsandbytes && \
     pip3 install safetensors && \
     pip3 install diffusers && \
     pip3 install transformers accelerate xformers triton && \
     pip3 install huggingface-hub && \
-    pip3 install runpod
+    pip3 install runpod && \
+    pip3 cache purge
 
 RUN mkdir -p /workspace
 WORKDIR /workspace
 
+RUN mkdir repositories && git clone --branch cuda --single-branch --depth 1 https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa
+#RUN mkdir repositories && git clone https://github.com/AlpinDale/gptq-gptj.git repositories/GPTQ-for-LLaMa && (cd repositories/GPTQ-for-LLaMa && python3 setup_cuda.py install)
 COPY model_fetcher.py /workspace/
 RUN python3 model_fetcher.py --model_name=${MODEL_NAME}
 #RUN git lfs install && \
diff --git a/runpod/runpod-worker-transformers/model_fetcher.py b/runpod/runpod-worker-transformers/model_fetcher.py
index 2be448c..ffe8547 100644
--- a/runpod/runpod-worker-transformers/model_fetcher.py
+++ b/runpod/runpod-worker-transformers/model_fetcher.py
@@ -2,6 +2,7 @@
 RunPod | Transformer | Model Fetcher
 '''
 
+import os
 import argparse
 
 import torch
@@ -30,17 +31,17 @@ def download_model(model_name):
     elif model_name == 'pygmalion-6b':
         # AutoModelForCausalLM.from_pretrained("PygmalionAI/pygmalion-6b", load_in_8bit=True)
         # AutoTokenizer.from_pretrained("PygmalionAI/pygmalion-6b")
-        snapshot_download(repo_id="PygmalionAI/pygmalion-6b", revision="main")
+        snapshot_path = snapshot_download(repo_id="PygmalionAI/pygmalion-6b", revision="main")
 
     # --------------------------------- Pygmalion -------------------------------- #
     elif model_name == 'pygmalion-6b-4bit-128g':
-        snapshot_download(repo_id="mayaeary/pygmalion-6b-4bit-128g", revision="main")
+        snapshot_path = snapshot_download(repo_id="mayaeary/pygmalion-6b-4bit-128g", revision="main")
 
     # --------------------------------- Pygmalion -------------------------------- #
     elif model_name == 'pygmalion-6b-gptq-4bit':
         # AutoModelForCausalLM.from_pretrained("OccamRazor/pygmalion-6b-gptq-4bit", from_pt=True)
         # AutoTokenizer.from_pretrained("OccamRazor/pygmalion-6b-gptq-4bit")
-        snapshot_download(repo_id="OccamRazor/pygmalion-6b-gptq-4bit", revision="main")
+        snapshot_path = snapshot_download(repo_id="OccamRazor/pygmalion-6b-gptq-4bit", revision="main")
 
     # ----------------------------------- GPT-J ----------------------------------- #
     elif model_name == 'gpt-j-6b':
@@ -78,6 +79,9 @@ def download_model(model_name):
         hf_hub_download(repo_id="BlinkDL/rwkv-4-raven", filename="RWKV-4-Raven-7B-v7-EngAndMore-20230404-ctx4096.pth")
         #https://huggingface.co/yahma/RWKV-14b_quant/resolve/main/RWKV-4-Pile-14B-20230213-8019.pqth
 
+    if 'snapshot_path' in locals() and snapshot_path:
+        os.system(f"ln -s \"{snapshot_path}\" /workdir/model")
+
 # ---------------------------------------------------------------------------- #
 #                                Parse Arguments                                #
 # ---------------------------------------------------------------------------- #
diff --git a/runpod/runpod-worker-transformers/runpod_infer.py b/runpod/runpod-worker-transformers/runpod_infer.py
index 7054ec0..aa1a076 100644
--- a/runpod/runpod-worker-transformers/runpod_infer.py
+++ b/runpod/runpod-worker-transformers/runpod_infer.py
@@ -3,11 +3,13 @@ RunPod | Transformer | Handler
 '''
 
 import argparse
+import accelerate
 import torch
 import runpod
 
 from runpod.serverless.utils.rp_validator import validate
 
 from transformers import (GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoXForCausalLM,
-                          GPTNeoXTokenizerFast, GPTJForCausalLM, AutoTokenizer, AutoModelForCausalLM,)
+                          GPTNeoXTokenizerFast, GPTJForCausalLM, AutoTokenizer, AutoModelForCausalLM,
+                          AutoConfig)
 # LlamaForCausalLM, LlamaTokenizer)
@@ -99,6 +101,81 @@ INPUT_SCHEMA = {
 }
 
 
+def load_quantized(model_name, wbits, groupsize):
+    """Adapted from https://github.com/oobabooga/text-generation-webui/blob/main/modules/GPTQ_loader.py"""
+    from pathlib import Path
+    import os, sys
+#    os.system("mkdir repositories && git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa")
+    sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))  # cloned in the Dockerfile
+    from modelutils import find_layers
+    from quant import make_quant
+
+    import transformers
+    from transformers import AutoConfig
+
+    # Try to determine the model type from the model name
+    name = model_name.lower()
+    if any((k in name for k in ['llama', 'alpaca', 'vicuna'])):
+        model_type = 'llama'
+    elif any((k in name for k in ['opt-', 'galactica'])):
+        model_type = 'opt'
+    elif any((k in name for k in ['gpt-j', 'pygmalion-6b'])):
+        model_type = 'gptj'
+    else:
+        print("Can't determine model type from model name. "
+              "Expected a llama, opt or gptj model.")
+        exit()
+
+    # Now we are going to try to locate the quantized model file.
+    #path_to_model = Path('/workdir/model')
+    path_to_model = next(Path('/root/.cache/huggingface/hub/').glob("models--*/snapshots/*/"))
+    found_pts = list(path_to_model.glob("*.pt"))
+    found_safetensors = list(path_to_model.glob("*.safetensors"))
+    pt_path = None
+
+    if len(found_pts) == 1:
+        pt_path = found_pts[0]
+    elif len(found_safetensors) == 1:
+        pt_path = found_safetensors[0]
+    else:
+        pass
+
+    if not pt_path:
+        print("Could not find the quantized model in .pt or .safetensors format, exiting...")
+        exit()
+
+    config = AutoConfig.from_pretrained(str(path_to_model))
+    def noop(*args, **kwargs):
+        pass
+    torch.nn.init.kaiming_uniform_ = noop  # skip random weight init, real weights are loaded below
+    torch.nn.init.uniform_ = noop
+    torch.nn.init.normal_ = noop
+
+    torch.set_default_dtype(torch.half)
+    transformers.modeling_utils._init_weights = False
+    torch.set_default_dtype(torch.half)
+    model = AutoModelForCausalLM.from_config(config)
+    torch.set_default_dtype(torch.float)
+    model = model.eval()
+    layers = find_layers(model)
+    for name in ['lm_head']:
+        if name in layers:
+            del layers[name]
+    make_quant(model, layers, wbits, groupsize)  # swap the remaining Linear layers for quantized ones
+    del layers
+
+    print('Loading model ...')
+    if str(pt_path).endswith('.safetensors'):
+        from safetensors.torch import load_file as safe_load
+        model.load_state_dict(safe_load(str(pt_path)))
+    else:
+        model.load_state_dict(torch.load(str(pt_path)))
+    model.seqlen = 2048
+    print('Done.')
+
+    return model
+
+
 def generator(job):
     '''
     Run the job input to generate text output.
     '''
@@ -169,8 +246,9 @@ if __name__ == "__main__":
             "PygmalionAI/pygmalion-6b", local_files_only=True)
 
     elif args.model_name == 'pygmalion-6b-4bit-128g':
-        model = AutoModelForCausalLM.from_pretrained(
-            "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True).to(device)
+#        model = AutoModelForCausalLM.from_pretrained(
+#            "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True).to(device)
+        model = load_quantized("pygmalion-6b-4bit-128g", 4, 128).to(device)
 
         tokenizer = AutoTokenizer.from_pretrained(
             "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True)