diff --git a/runpod/runpod-worker-transformers/Dockerfile b/runpod/runpod-worker-transformers/Dockerfile
index 691d2e4..c2b7018 100644
--- a/runpod/runpod-worker-transformers/Dockerfile
+++ b/runpod/runpod-worker-transformers/Dockerfile
@@ -1,8 +1,9 @@
+ARG DEV_IMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
 ARG BASE_IMAGE=nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
 #ARG BASE_IMAGE=nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
 #ARG BASE_IMAGE=runpod/pytorch:3.10-2.0.0-117
 #ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.03-py3
-FROM ${BASE_IMAGE} as dev-base
+FROM ${DEV_IMAGE} as builder
 
 ARG MODEL_NAME
 ENV MODEL_NAME=${MODEL_NAME}
@@ -15,16 +16,17 @@ ENV DEBIAN_FRONTEND noninteractive\
 RUN apt-get update --yes && \
     # - apt-get upgrade is run to patch known vulnerabilities in apt-get packages as
     #   the ubuntu base image is rebuilt too seldom sometimes (less than once a month)
-    #apt-get upgrade --yes && \
+    apt-get upgrade --yes && \
     apt install --yes --no-install-recommends \
-#        build-essential \
+        build-essential \
+        cmake \
         ca-certificates \
         git \
         git-lfs \
         wget \
         curl \
         bash \
-        libgl1 \
+#        libgl1 \
         software-properties-common \
         openssh-server && \
     apt-get clean && rm -rf /var/lib/apt/lists/* && \
@@ -51,6 +53,7 @@ RUN pip3 install --upgrade pip && \
     pip3 install torch torchvision torchaudio --extra-index-url=https://download.pytorch.org/whl/cu118 && \
     pip3 install bitsandbytes && \
     pip3 install safetensors && \
+    pip3 install sentencepiece triton && \
     pip3 install diffusers && \
     pip3 install transformers accelerate xformers triton && \
     pip3 install huggingface-hub && \
@@ -60,13 +63,44 @@ RUN mkdir -p /workspace
 WORKDIR /workspace
 
-RUN mkdir repositories && git clone --branch cuda --single-branch --depth 1 https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa
-#RUN mkdir repositories && git clone https://github.com/AlpinDale/gptq-gptj.git repositories/GPTQ-for-LLaMa && (cd repositories/GPTQ-for-LLaMa && python3 setup_cuda.py install)
+ENV TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX"
+RUN mkdir repositories && git clone --branch cuda --single-branch https://github.com/qwopqwop200/GPTQ-for-LLaMa.git repositories/GPTQ-for-LLaMa && \
+    (cd repositories/GPTQ-for-LLaMa && git reset --hard 437154dd434c3f9d5c9c4e6f401d6d71116ac248) && \
+#RUN mkdir repositories && git clone --depth 1 https://github.com/AlpinDale/gptq-gptj.git repositories/GPTQ-for-LLaMa && \
+    (cd repositories/GPTQ-for-LLaMa && python3 setup_cuda.py install)
+
 COPY model_fetcher.py /workspace/
 RUN python3 model_fetcher.py --model_name=${MODEL_NAME}
 
+
+
+FROM ${BASE_IMAGE}
+RUN mkdir -p /workspace
+WORKDIR /workspace
+
+RUN apt-get update --yes && \
+    apt install --yes --no-install-recommends \
+        python3 python3-dev python3-venv python3-pip && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install --upgrade pip && \
+    pip3 install torch torchvision torchaudio --extra-index-url=https://download.pytorch.org/whl/cu118 && \
+    pip3 install bitsandbytes && \
+    pip3 install safetensors && \
+    pip3 install sentencepiece triton && \
+    pip3 install diffusers && \
+    pip3 install transformers accelerate xformers triton && \
+    pip3 install huggingface-hub && \
+    pip3 install runpod && \
+    pip3 cache purge
+
+RUN mkdir /workspace/repositories
+COPY --from=builder /workspace/repositories /workspace/repositories/
+COPY --from=builder /root/.cache/huggingface /root/.cache/huggingface
+
 #RUN git lfs install && \
 #    git clone --depth 1 https://huggingface.co/${MODEL_NAME}
 
+COPY model_fetcher.py /workspace/
 COPY runpod_infer.py /workspace/
 COPY test_input.json /workspace/
 
diff --git a/runpod/runpod-worker-transformers/model_fetcher.py b/runpod/runpod-worker-transformers/model_fetcher.py
index ffe8547..7a79017 100644
--- a/runpod/runpod-worker-transformers/model_fetcher.py
+++ b/runpod/runpod-worker-transformers/model_fetcher.py
@@ -80,6 +80,7 @@ def download_model(model_name):
     #https://huggingface.co/yahma/RWKV-14b_quant/resolve/main/RWKV-4-Pile-14B-20230213-8019.pqth
 
     if snapshot_path:
+        print(f"model downloaded to \"{snapshot_path}\"")
         os.system("ln -s \"{snapshot_path}\" /workdir/model")
 
 # ---------------------------------------------------------------------------- #
diff --git a/runpod/runpod-worker-transformers/runpod_infer.py b/runpod/runpod-worker-transformers/runpod_infer.py
index aa1a076..b947436 100644
--- a/runpod/runpod-worker-transformers/runpod_infer.py
+++ b/runpod/runpod-worker-transformers/runpod_infer.py
@@ -101,7 +101,7 @@ INPUT_SCHEMA = {
 }
 
 
-def load_quantized(model_name, wbits, groupsize):
+def load_quantized(model_name, wbits, groupsize, device):
     """https://github.com/oobabooga/text-generation-webui/blob/main/modules/GPTQ_loader.py"""
     from pathlib import Path
     import os, sys
@@ -167,7 +167,9 @@ def load_quantized(model_name, wbits, groupsize):
     print('Loading model ...')
     if str(pt_path).endswith('.safetensors'):
         from safetensors.torch import load_file as safe_load
-        model.load_state_dict(safe_load(str(pt_path)))
+        if device == -1:
+            device = "cpu"
+        model.load_state_dict(safe_load(str(pt_path), device))
     else:
         model.load_state_dict(torch.load(str(pt_path)))
     model.seqlen = 2048
@@ -248,7 +250,7 @@ if __name__ == "__main__":
     elif args.model_name == 'pygmalion-6b-4bit-128g':
         # model = AutoModelForCausalLM.from_pretrained(
         #     "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True).to(device)
-        model = load_quantized("pygmalion-6b-4bit-128g", 4, 128).to(device)
+        model = load_quantized("pygmalion-6b-4bit-128g", 4, 128, device).to(device)
         tokenizer = AutoTokenizer.from_pretrained(
             "mayaeary/pygmalion-6b-4bit-128g", local_files_only=True)
 