From 104ace0c611893dd1baa509f7efb87faa1a4b9be Mon Sep 17 00:00:00 2001
From: Hendrik Langer
Date: Fri, 30 Jun 2023 14:37:41 +0200
Subject: [PATCH] tweak llm parameters

---
 matrix_pygmalion_bot/bot/ai/langchain.py        | 11 +++++++----
 matrix_pygmalion_bot/bot/wrappers/koboldcpp.py  | 10 +++++-----
 .../bot/wrappers/langchain_koboldcpp.py         | 13 ++++++++-----
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/matrix_pygmalion_bot/bot/ai/langchain.py b/matrix_pygmalion_bot/bot/ai/langchain.py
index ee61a00..c6a095b 100644
--- a/matrix_pygmalion_bot/bot/ai/langchain.py
+++ b/matrix_pygmalion_bot/bot/ai/langchain.py
@@ -90,10 +90,11 @@ class AI(object):
         self.bot = bot
         self.memory_path = memory_path
         self.rooms = {}
+        self.max_context = 2048
 
         from ..wrappers.langchain_koboldcpp import KoboldCpp
-        self.llm_chat = KoboldCpp(temperature=self.bot.temperature, endpoint_url="http://172.16.85.10:5001/api/latest/generate", stop=['<|endoftext|>'], verbose=True)
-        self.llm_summary = KoboldCpp(temperature=0.7, repeat_penalty=1.176, top_k = 40, top_p= 0.1, endpoint_url="http://172.16.85.10:5001/api/latest/generate", stop=['<|endoftext|>'], max_tokens=512, verbose=True)
+        self.llm_chat = KoboldCpp(temperature=self.bot.temperature, endpoint_url="http://172.16.85.10:5001/api/latest/generate", max_context=self.max_context, stop=['<|endoftext|>'], verbose=True)
+        self.llm_summary = KoboldCpp(temperature=0.7, repeat_penalty=1.15, top_k=20, top_p=0.9, endpoint_url="http://172.16.85.10:5001/api/latest/generate", max_context=self.max_context, stop=['<|endoftext|>'], max_tokens=512, verbose=True)
         self.llm_chat_model = "pygmalion-7b"
         self.llm_summary_model = "vicuna-13b"
         self.text_wrapper = text_wrapper
@@ -122,7 +123,7 @@ class AI(object):
             last_message_ids_summarized = []
             if not human_prefix:
                 human_prefix = "Human"
-            memory = CustomMemory(memory_key="chat_history", input_key="input", human_prefix=human_prefix, ai_prefix=self.bot.name, llm=self.llm_summary, summary_prompt=prompt_progressive_summary, moving_summary_buffer=moving_summary, max_len=1200, min_len=200, last_message_ids_summarized=last_message_ids_summarized)
+            memory = CustomMemory(memory_key="chat_history", input_key="input", human_prefix=human_prefix, ai_prefix=self.bot.name, llm=self.llm_summary, summary_prompt=prompt_progressive_summary, moving_summary_buffer=moving_summary, max_len=int(self.max_context-800), min_len=int(0.1*self.max_context), last_message_ids_summarized=last_message_ids_summarized)
             self.rooms[room_id]["memory"] = memory
             #memory.chat_memory.add_ai_message(self.bot.greeting)
         else:
@@ -251,7 +252,7 @@ class AI(object):
         tmp_prompt_text = prompt.format(chat_history=conversation_memory.buffer, input=message.content)
         prompt_len = self.llm_chat.get_num_tokens(tmp_prompt_text)
 
-        if prompt_len+200 > 2048:
+        if prompt_len+200 > self.max_context:
             logger.warning(f"Prompt too large. Estimated {prompt_len} tokens. Summarizing...")
             await reply_fn(f"Prompt too large. Estimated {prompt_len} tokens")
             if i == 0:
@@ -460,6 +461,8 @@ class AI(object):
             conversation_memory.chat_memory.messages.append(message)
             #conversation_memory.chat_memory.add_system_message(message)
 
+        # [ 21:30 | Tuesday 9th | Pentagram City Alleys | 18°C | Overcast | 92% ]
+
         # Summarize the last day and save a diary entry
         yesterday = ( datetime.now() - timedelta(days=1) ).strftime('%Y-%m-%d')
         for room_id in self.rooms.keys():
diff --git a/matrix_pygmalion_bot/bot/wrappers/koboldcpp.py b/matrix_pygmalion_bot/bot/wrappers/koboldcpp.py
index 9af8f03..eb19760 100644
--- a/matrix_pygmalion_bot/bot/wrappers/koboldcpp.py
+++ b/matrix_pygmalion_bot/bot/wrappers/koboldcpp.py
@@ -19,7 +19,7 @@ class KoboldCppTextWrapper(object):
     #python3 koboldcpp.py models/pygmalion-6b-v3-ggml-ggjt-q4_0.bin
     #python3 koboldcpp.py --smartcontext models/pygmalion-6b-v3-ggml-ggjt-q4_0.bin
 
-    async def generate(self, prompt: str, typing_fn, temperature=0.72, max_new_tokens=200, timeout=180):
+    async def generate(self, prompt: str, typing_fn, temperature=0.72, max_new_tokens=200, max_context=2048, timeout=180):
         # Set the API endpoint URL
         endpoint = f"http://{self.endpoint_name}/api/latest/generate"
 
@@ -31,12 +31,12 @@ class KoboldCppTextWrapper(object):
         # Define your inputs
         input_data = {
             "prompt": prompt,
-            "max_context_length": 2048,
+            "max_context_length": max_context,
             "max_length": max_new_tokens,
             "temperature": temperature,
-            "top_k": 50,
-            "top_p": 0.85,
-            "rep_pen": 1.08,
+            "top_k": 20,
+            "top_p": 0.9,
+            "rep_pen": 1.15,
             "rep_pen_range": 1024,
             "stop_sequence": ['<|endoftext|>'],
         }
diff --git a/matrix_pygmalion_bot/bot/wrappers/langchain_koboldcpp.py b/matrix_pygmalion_bot/bot/wrappers/langchain_koboldcpp.py
index 7b42b6c..8eb3b9d 100644
--- a/matrix_pygmalion_bot/bot/wrappers/langchain_koboldcpp.py
+++ b/matrix_pygmalion_bot/bot/wrappers/langchain_koboldcpp.py
@@ -20,19 +20,22 @@ class KoboldCpp(LLM):
 
     endpoint_url: str = "http://172.16.85.10:5001/api/latest/generate"
 
-    temperature: Optional[float] = 0.8
+    temperature: Optional[float] = 0.7
     """The temperature to use for sampling."""
 
+    max_context: Optional[int] = 2048
+    """The maximum context size."""
+
     max_tokens: Optional[int] = 256
     """The maximum number of tokens to generate."""
 
     top_p: Optional[float] = 0.90
     """The top-p value to use for sampling."""
 
-    repeat_penalty: Optional[float] = 1.1
+    repeat_penalty: Optional[float] = 1.15
     """The penalty to apply to repeated tokens."""
 
-    top_k: Optional[int] = 40
+    top_k: Optional[int] = 20
     """The top-k value to use for sampling."""
 
     stop: Optional[List[str]] = []
@@ -51,7 +54,7 @@ class KoboldCpp(LLM):
         #params = self.model_kwargs or {}
         input_data = {
             "prompt": prompt,
-            "max_context_length": 2048,
+            "max_context_length": self.max_context,
             "max_length": self.max_tokens,
             "temperature": self.temperature,
             "top_k": self.top_k,
@@ -101,7 +104,7 @@ class KoboldCpp(LLM):
         #params = self.model_kwargs or {}
         input_data = {
             "prompt": prompt,
-            "max_context_length": 2048,
+            "max_context_length": self.max_context,
             "max_length": self.max_tokens,
             "temperature": self.temperature,
             "top_k": self.top_k,
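
Note (reviewer sketch, not part of the commit): the token budget this patch
introduces is spread across three files, so a condensed view may help. The
helper names below are illustrative only; the constants mirror the diff
(2048-token window, 200 tokens reserved for the reply, and a memory buffer
summarized between max_context-800 and 0.1*max_context tokens).

    # Sketch of the context budget implied by the patch.
    # Helper names are illustrative, not from the repository.

    MAX_CONTEXT = 2048      # self.max_context in AI.__init__
    REPLY_RESERVE = 200     # headroom the prompt check keeps for the reply

    def prompt_fits(prompt_len: int, max_context: int = MAX_CONTEXT) -> bool:
        # Mirrors `if prompt_len+200 > self.max_context:` -- the prompt plus
        # the reserved reply tokens must fit inside the context window.
        return prompt_len + REPLY_RESERVE <= max_context

    def memory_budget(max_context: int = MAX_CONTEXT) -> tuple[int, int]:
        # CustomMemory gets max_len=int(max_context-800) and
        # min_len=int(0.1*max_context): the chat buffer is summarized once
        # it outgrows window-minus-800 tokens, down toward ~10% of the window.
        return int(max_context - 800), int(0.1 * max_context)

    print(prompt_fits(1900))   # False -> triggers the "Summarizing..." path
    print(memory_budget())     # (1248, 204)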
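
For quick manual testing: after this patch both wrappers send the same
sampling defaults to KoboldCpp's /api/latest/generate endpoint. A standalone
request with those values might look like the sketch below (host/port are the
ones hard-coded in the diff; the results[0].text shape is the usual KoboldAI
API response, which koboldcpp implements).

    import requests

    endpoint = "http://172.16.85.10:5001/api/latest/generate"
    payload = {
        "prompt": "### Instruction: say hi\n### Response:",  # any test prompt
        "max_context_length": 2048,  # now configurable via max_context
        "max_length": 256,
        "temperature": 0.7,
        "top_k": 20,      # narrowed from 50/40
        "top_p": 0.9,     # unified from 0.85/0.90 (and the old 0.1 summary value)
        "rep_pen": 1.15,  # unified from 1.08/1.1/1.176
        "rep_pen_range": 1024,
        "stop_sequence": ["<|endoftext|>"],
    }
    r = requests.post(endpoint, json=payload, timeout=180)
    r.raise_for_status()
    print(r.json()["results"][0]["text"])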