import asyncio
import logging
import os

import requests

logger = logging.getLogger(__name__)


class KoboldCppTextWrapper:
    """Wrapper around a koboldcpp HTTP text-generation endpoint."""

    def __init__(self, endpoint_name: str, model_name: str):
        self.endpoint_name = endpoint_name
        self.model_name = model_name

    def setup(self):
        # Clone koboldcpp, install BLAS dependencies, build, and download the model weights.
        os.system("mkdir -p repositories && (cd repositories && git clone https://github.com/LostRuins/koboldcpp.git)")
        os.system("apt update && apt-get install libopenblas-dev libclblast-dev libmkl-dev")
        os.system("(cd repositories/koboldcpp && make LLAMA_OPENBLAS=1 && cd models && wget https://huggingface.co/concedo/pygmalion-6bv3-ggml-ggjt/resolve/main/pygmalion-6b-v3-ggml-ggjt-q4_0.bin)")
        # To start the server manually:
        #   python3 koboldcpp.py models/pygmalion-6b-v3-ggml-ggjt-q4_0.bin
        #   python3 koboldcpp.py --smartcontext models/pygmalion-6b-v3-ggml-ggjt-q4_0.bin

    async def generate(self, prompt: str, typing_fn, temperature=0.72, max_new_tokens=200, timeout=180):
        # Set the API endpoint URL
        endpoint = f"http://{self.endpoint_name}/api/latest/generate"

        # Set the headers for the request
        headers = {
            "Content-Type": "application/json",
        }

        # Define the generation parameters
        input_data = {
            "prompt": prompt,
            "max_context_length": 2048,
            "max_length": max_new_tokens,
            "temperature": temperature,
            "top_k": 50,
            "top_p": 0.85,
            "rep_pen": 1.08,
            "rep_pen_range": 1024,
            "stop_sequence": ["<|endoftext|>"],
        }

        logger.info(f"sending request to koboldcpp. endpoint=\"{self.endpoint_name}\"")

        TRIES = 30
        for _ in range(TRIES):
            r = requests.post(endpoint, json=input_data, headers=headers, timeout=timeout)
            r_json = r.json()
            logger.info(r_json)

            if r.status_code == 200:
                return r_json["results"][0]["text"]
            elif r.status_code == 503:
                # The server handles one request at a time; wait before retrying.
                logger.info("api is busy. waiting...")
                await asyncio.sleep(5)

        raise ValueError("TIMEOUT / NO OUTPUT")
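
# A minimal usage sketch, assuming a koboldcpp server is already running.
# The host:port, model name, prompt, and the no-op typing_fn below are
# illustrative placeholders, not values defined elsewhere in this module;
# generate() accepts typing_fn but does not invoke it here.
if __name__ == "__main__":
    async def _demo():
        wrapper = KoboldCppTextWrapper("localhost:5001", "pygmalion-6b")  # assumed endpoint

        async def typing_fn():
            pass  # placeholder callback

        text = await wrapper.generate("User: Hello!\nBot:", typing_fn)
        print(text)

    asyncio.run(_demo())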