From 7e9918a06d380cf237fadd08740da6801c025503 Mon Sep 17 00:00:00 2001
From: Hendrik Langer
Date: Wed, 12 Apr 2023 15:45:08 +0200
Subject: [PATCH] add local koboldcpp generation

---
 matrix_pygmalion_bot/ai/koboldcpp.py         | 84 ++++++++++++++++++++
 matrix_pygmalion_bot/ai/pygmalion_helpers.py | 82 +++++++++++++++++++
 matrix_pygmalion_bot/ai/runpod_pygmalion.py  | 82 ++++---------------
 matrix_pygmalion_bot/core.py                 |  1 +
 4 files changed, 181 insertions(+), 68 deletions(-)
 create mode 100644 matrix_pygmalion_bot/ai/koboldcpp.py
 create mode 100644 matrix_pygmalion_bot/ai/pygmalion_helpers.py

diff --git a/matrix_pygmalion_bot/ai/koboldcpp.py b/matrix_pygmalion_bot/ai/koboldcpp.py
new file mode 100644
index 0000000..2ecc5de
--- /dev/null
+++ b/matrix_pygmalion_bot/ai/koboldcpp.py
@@ -0,0 +1,84 @@
+import asyncio
+import os, tempfile
+import logging
+
+import json
+import requests
+
+from transformers import AutoTokenizer, AutoConfig
+from huggingface_hub import hf_hub_download
+
+import io
+import base64
+from PIL import Image, PngImagePlugin
+
+from .pygmalion_helpers import get_full_prompt, num_tokens
+
+logger = logging.getLogger(__name__)
+
+
+def setup():
+    os.system("mkdir -p repositories && (cd repositories && git clone https://github.com/LostRuins/koboldcpp.git)")
+    os.system("(cd repositories/koboldcpp && make LLAMA_OPENBLAS=1 && cd models && wget https://huggingface.co/concedo/pygmalion-6bv3-ggml-ggjt/resolve/main/pygmalion-6b-v3-ggml-ggjt-q4_0.bin)")
+    #python3 koboldcpp.py models/pygmalion-6b-v3-ggml-ggjt-q4_0.bin
+
+
+async def generate_sync(
+    prompt: str,
+    api_key: str,
+    bot,
+    typing_fn,
+    api_endpoint = "pygmalion-6b"
+):
+    # Set the API endpoint URL
+    endpoint = f"http://172.16.85.10:5001/api/latest/generate"
+
+    # Set the headers for the request
+    headers = {
+        "Content-Type": "application/json",
+    }
+
+    max_new_tokens = 120
+    prompt_num_tokens = await num_tokens(prompt)
+
+    # Define your inputs
+    input_data = {
+        "prompt": prompt,
+        "max_context_length": 2048,
+        "max_length": max_new_tokens,
+        "temperature": bot.temperature,
+        "top_k": 0,
+        "top_p": 0,
+        "rep_pen": 1.08,
+        "rep_pen_range": 1024,
+        "quiet": True,
+    }
+
+    logger.info(f"sending request to koboldcpp")
+
+    # Make the request
+    try:
+        r = requests.post(endpoint, json=input_data, headers=headers, timeout=360)
+    except requests.exceptions.RequestException as e:
+        raise ValueError(f"<ERROR> {e}")
+    r_json = r.json()
+    logger.info(r_json)
+
+    if r.status_code == 200:
+        reply = r_json["results"][0]["text"]
+        idx = reply.find(f"\nYou:")
+        if idx != -1:
+            reply = reply[:idx].strip()
+        else:
+            reply = reply.removesuffix('<|endoftext|>').strip()
+        reply = reply.replace(f"\n{bot.name}: ", " ")
+        reply = reply.replace(f"\n<BOT>: ", " ")
+        reply = reply.replace("<BOT>", bot.name)
+        reply = reply.replace("<USER>", "You")
+        return reply.strip()
+    else:
+        raise ValueError(f"<ERROR> {r.status_code}")
+
+
+async def generate_image(input_prompt: str, negative_prompt: str, api_url: str, api_key: str, typing_fn):
+    pass

diff --git a/matrix_pygmalion_bot/ai/pygmalion_helpers.py b/matrix_pygmalion_bot/ai/pygmalion_helpers.py
new file mode 100644
index 0000000..1a46879
--- /dev/null
+++ b/matrix_pygmalion_bot/ai/pygmalion_helpers.py
@@ -0,0 +1,82 @@
+import asyncio
+import os, tempfile
+import logging
+
+import json
+import requests
+
+from transformers import AutoTokenizer, AutoConfig
+from huggingface_hub import hf_hub_download
+
+import io
+import base64
+from PIL import Image, PngImagePlugin
+
+logger = logging.getLogger(__name__)
+
+
+async def get_full_prompt(simple_prompt: str, bot, chat_history):
+
+    # Prompt without history
+    prompt = bot.name + "'s Persona: " + bot.persona + "\n"
+    prompt += "Scenario: " + bot.scenario + "\n"
+    prompt += "<START>" + "\n"
+    #prompt += bot.name + ": " + bot.greeting + "\n"
+    prompt += "You: " + simple_prompt + "\n"
+    prompt += bot.name + ":"
+
+    MAX_TOKENS = 2048
+    max_new_tokens = 200
+    total_num_tokens = await num_tokens(prompt)
+    visible_history = []
+    current_message = True
+    for key, chat_item in reversed(chat_history.chat_history.items()):
+        if current_message:
+            current_message = False
+            continue
+        if chat_item.message["en"].startswith('!begin'):
+            break
+        if chat_item.message["en"].startswith('!'):
+            continue
+        if chat_item.message["en"].startswith('<ERROR>'):
+            continue
+        #if chat_item.message["en"] == bot.greeting:
+        #    continue
+        if chat_item.num_tokens == None:
+            chat_item.num_tokens = await num_tokens("{}: {}".format(chat_item.user_name, chat_item.message["en"]))
+        # TODO: is it MAX_TOKENS or MAX_TOKENS - max_new_tokens??
+        logger.debug(f"History: " + str(chat_item) + " [" + str(chat_item.num_tokens) + "]")
+        if total_num_tokens + chat_item.num_tokens < MAX_TOKENS - max_new_tokens:
+            visible_history.append(chat_item)
+            total_num_tokens += chat_item.num_tokens
+        else:
+            break
+    visible_history = reversed(visible_history)
+
+    prompt = bot.name + "'s Persona: " + bot.persona + "\n"
+    prompt += "Scenario: " + bot.scenario + "\n"
+    prompt += "<START>" + "\n"
+    #prompt += bot.name + ": " + bot.greeting + "\n"
+    for chat_item in visible_history:
+        if chat_item.is_own_message:
+            prompt += bot.name + ": " + chat_item.message["en"] + "\n"
+        else:
+            prompt += "You" + ": " + chat_item.message["en"] + "\n"
+    prompt += "You: " + simple_prompt + "\n"
+    prompt += bot.name + ":"
+
+    return prompt
+
+
+async def num_tokens(input_text: str):
+#    os.makedirs("./models/pygmalion-6b", exist_ok=True)
+#    hf_hub_download(repo_id="PygmalionAI/pygmalion-6b", filename="config.json", cache_dir="./models/pygmalion-6b")
+#    config = AutoConfig.from_pretrained("./models/pygmalion-6b/config.json")
+    tokenizer = AutoTokenizer.from_pretrained("PygmalionAI/pygmalion-6b")
+    encoding = tokenizer.encode(input_text, add_special_tokens=False)
+    max_input_size = tokenizer.max_model_input_sizes
+    return len(encoding)
+
+
+async def estimate_num_tokens(input_text: str):
+    return len(input_text)//4+1
diff --git a/matrix_pygmalion_bot/ai/runpod_pygmalion.py b/matrix_pygmalion_bot/ai/runpod_pygmalion.py
index e6c75e1..750d925 100644
--- a/matrix_pygmalion_bot/ai/runpod_pygmalion.py
+++ b/matrix_pygmalion_bot/ai/runpod_pygmalion.py
@@ -12,6 +12,8 @@ import io
 import base64
 from PIL import Image, PngImagePlugin
 
+from .pygmalion_helpers import get_full_prompt, num_tokens
+
 logger = logging.getLogger(__name__)
 
 
@@ -47,7 +49,10 @@ async def generate_sync(
     logger.info(f"sending request to runpod.io")
 
     # Make the request
-    r = requests.post(endpoint, json=input_data, headers=headers, timeout=180)
+    try:
+        r = requests.post(endpoint, json=input_data, headers=headers, timeout=180)
+    except requests.exceptions.RequestException as e:
+        raise ValueError(f"<ERROR> {e}")
     r_json = r.json()
     logger.info(r_json)
 
@@ -95,71 +100,6 @@ async def generate_sync(
     else:
         raise ValueError(f"<ERROR>")
 
-async def get_full_prompt(simple_prompt: str, bot, chat_history):
-
-    # Prompt without history
-    prompt = bot.name + "'s Persona: " + bot.persona + "\n"
-    prompt += "Scenario: " + bot.scenario + "\n"
-    prompt += "<START>" + "\n"
-    #prompt += bot.name + ": " + bot.greeting + "\n"
-    prompt += "You: " + simple_prompt + "\n"
-    prompt += bot.name + ":"
-
-    MAX_TOKENS = 2048
-    max_new_tokens = 200
-    total_num_tokens = await num_tokens(prompt)
-    visible_history = []
-    current_message = True
-    for key, chat_item in reversed(chat_history.chat_history.items()):
-        if current_message:
-            current_message = False
-            continue
-        if chat_item.message["en"].startswith('!begin'):
-            break
-        if chat_item.message["en"].startswith('!'):
-            continue
-        if chat_item.message["en"].startswith('<ERROR>'):
-            continue
-        #if chat_item.message["en"] == bot.greeting:
-        #    continue
-        if chat_item.num_tokens == None:
-            chat_item.num_tokens = await num_tokens("{}: {}".format(chat_item.user_name, chat_item.message["en"]))
-        # TODO: is it MAX_TOKENS or MAX_TOKENS - max_new_tokens??
-        logger.debug(f"History: " + str(chat_item) + " [" + str(chat_item.num_tokens) + "]")
-        if total_num_tokens + chat_item.num_tokens < MAX_TOKENS - max_new_tokens:
-            visible_history.append(chat_item)
-            total_num_tokens += chat_item.num_tokens
-        else:
-            break
-    visible_history = reversed(visible_history)
-
-    prompt = bot.name + "'s Persona: " + bot.persona + "\n"
-    prompt += "Scenario: " + bot.scenario + "\n"
-    prompt += "<START>" + "\n"
-    #prompt += bot.name + ": " + bot.greeting + "\n"
-    for chat_item in visible_history:
-        if chat_item.is_own_message:
-            prompt += bot.name + ": " + chat_item.message["en"] + "\n"
-        else:
-            prompt += "You" + ": " + chat_item.message["en"] + "\n"
-    prompt += "You: " + simple_prompt + "\n"
-    prompt += bot.name + ":"
-
-    return prompt
-
-
-async def num_tokens(input_text: str):
-#    os.makedirs("./models/pygmalion-6b", exist_ok=True)
-#    hf_hub_download(repo_id="PygmalionAI/pygmalion-6b", filename="config.json", cache_dir="./models/pygmalion-6b")
-#    config = AutoConfig.from_pretrained("./models/pygmalion-6b/config.json")
-    tokenizer = AutoTokenizer.from_pretrained("PygmalionAI/pygmalion-6b")
-    encoding = tokenizer.encode(input_text, add_special_tokens=False)
-    max_input_size = tokenizer.max_model_input_sizes
-    return len(encoding)
-
-async def estimate_num_tokens(input_text: str):
-    return len(input_text)//4+1
-
 
 async def download_image(url, path):
     r = requests.get(url, stream=True)
@@ -194,7 +134,10 @@ async def generate_image(input_prompt: str, negative_prompt: str, api_url: str,
     logger.info(f"sending request to runpod.io")
 
     # Make the request
-    r = requests.post(endpoint, json=input_data, headers=headers)
+    try:
+        r = requests.post(endpoint, json=input_data, headers=headers)
+    except requests.exceptions.RequestException as e:
+        raise ValueError(f"<ERROR> {e}")
     r_json = r.json()
     logger.debug(r_json)
 
@@ -292,7 +235,10 @@ async def serverless_automatic_request(payload, cmd, api_url: str, api_key: str
     logger.info(f"sending request to runpod.io")
 
     # Make the request
-    r = requests.post(endpoint, json=input_data, headers=headers)
+    try:
+        r = requests.post(endpoint, json=input_data, headers=headers)
+    except requests.exceptions.RequestException as e:
+        raise ValueError(f"<ERROR> {e}")
     r_json = r.json()
     logger.debug(r_json)
 
diff --git a/matrix_pygmalion_bot/core.py b/matrix_pygmalion_bot/core.py
index 6f95fec..94df4ef 100644
--- a/matrix_pygmalion_bot/core.py
+++ b/matrix_pygmalion_bot/core.py
@@ -17,6 +17,7 @@ import json
 from .helpers import Event
 from .chatlog import BotChatHistory
 ai = importlib.import_module("matrix_pygmalion_bot.ai.runpod_pygmalion")
+ai = importlib.import_module("matrix_pygmalion_bot.ai.koboldcpp")
 #ai = importlib.import_module("matrix_pygmalion_bot.ai.stablehorde")
 #from .llama_cpp import generate, get_full_prompt, get_full_prompt_chat_style
 #from .runpod_pygmalion import generate_sync, get_full_prompt
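
Reviewer note, not part of the patch: the new generate_sync() in koboldcpp.py relies on koboldcpp's HTTP generation API. Below is a minimal sketch of the request/response contract it assumes, written against a local koboldcpp instance on its default port 5001 (the patch itself hard-codes http://172.16.85.10:5001; the prompt text and sampling values here are illustrative only):

    import requests

    # Same payload shape as input_data in generate_sync()
    payload = {
        "prompt": "Julia's Persona: a friendly bot\n<START>\nYou: Hi!\nJulia:",
        "max_context_length": 2048,
        "max_length": 120,
        "temperature": 0.72,
        "rep_pen": 1.08,
        "quiet": True,
    }
    r = requests.post("http://127.0.0.1:5001/api/latest/generate",
                      json=payload, timeout=360)
    r.raise_for_status()
    # koboldcpp answers with {"results": [{"text": "<generated text>"}]}
    print(r.json()["results"][0]["text"])

The server side is what setup()'s comment starts: python3 koboldcpp.py models/pygmalion-6b-v3-ggml-ggjt-q4_0.bin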
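
A possible follow-up, sketch only: num_tokens() in pygmalion_helpers.py calls AutoTokenizer.from_pretrained() on every invocation, which hits the Hugging Face cache (or the network) each time a prompt is measured. Loading the tokenizer lazily once at module level keeps the same interface while avoiding the repeated cost:

    from transformers import AutoTokenizer

    _tokenizer = None  # loaded on first use, then reused by every call

    async def num_tokens(input_text: str):
        global _tokenizer
        if _tokenizer is None:
            _tokenizer = AutoTokenizer.from_pretrained("PygmalionAI/pygmalion-6b")
        return len(_tokenizer.encode(input_text, add_special_tokens=False))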