
tweak llm parameters

Branch: master
Hendrik Langer committed 11 months ago · commit 104ace0c61

3 changed files:
  1. matrix_pygmalion_bot/bot/ai/langchain.py (11 changed lines)
  2. matrix_pygmalion_bot/bot/wrappers/koboldcpp.py (10 changed lines)
  3. matrix_pygmalion_bot/bot/wrappers/langchain_koboldcpp.py (13 changed lines)
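
In short, this commit moves both backends to one shared sampling profile and threads a configurable context size through the stack instead of a hard-coded 2048. The values involved, collected from the hunks below into a plain Python dict as a reading aid (the dict itself is not part of the commit):

# Shared sampling profile after this commit (summary aid only, not repo code).
NEW_SAMPLING = {
    "temperature": 0.7,          # langchain wrapper default, was 0.8; chat keeps the per-bot value
    "top_k": 20,                 # was 50 (koboldcpp.py) / 40 (langchain_koboldcpp.py)
    "top_p": 0.9,                # was 0.85 / 0.90 (and 0.1 for the summary LLM)
    "rep_pen": 1.15,             # was 1.08 / 1.1 (and 1.176 for the summary LLM)
    "max_context_length": 2048,  # still 2048, but now passed around as max_context
}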

matrix_pygmalion_bot/bot/ai/langchain.py (11 changed lines)

@@ -90,10 +90,11 @@ class AI(object):
         self.bot = bot
         self.memory_path = memory_path
         self.rooms = {}
+        self.max_context = 2048
         from ..wrappers.langchain_koboldcpp import KoboldCpp
-        self.llm_chat = KoboldCpp(temperature=self.bot.temperature, endpoint_url="http://172.16.85.10:5001/api/latest/generate", stop=['<|endoftext|>'], verbose=True)
-        self.llm_summary = KoboldCpp(temperature=0.7, repeat_penalty=1.176, top_k = 40, top_p= 0.1, endpoint_url="http://172.16.85.10:5001/api/latest/generate", stop=['<|endoftext|>'], max_tokens=512, verbose=True)
+        self.llm_chat = KoboldCpp(temperature=self.bot.temperature, endpoint_url="http://172.16.85.10:5001/api/latest/generate", max_context=self.max_context, stop=['<|endoftext|>'], verbose=True)
+        self.llm_summary = KoboldCpp(temperature=0.7, repeat_penalty=1.15, top_k = 20, top_p= 0.9, endpoint_url="http://172.16.85.10:5001/api/latest/generate", max_context=self.max_context, stop=['<|endoftext|>'], max_tokens=512, verbose=True)
         self.llm_chat_model = "pygmalion-7b"
         self.llm_summary_model = "vicuna-13b"
         self.text_wrapper = text_wrapper
@@ -122,7 +123,7 @@ class AI(object):
             last_message_ids_summarized = []
             if not human_prefix:
                 human_prefix = "Human"
-            memory = CustomMemory(memory_key="chat_history", input_key="input", human_prefix=human_prefix, ai_prefix=self.bot.name, llm=self.llm_summary, summary_prompt=prompt_progressive_summary, moving_summary_buffer=moving_summary, max_len=1200, min_len=200, last_message_ids_summarized=last_message_ids_summarized)
+            memory = CustomMemory(memory_key="chat_history", input_key="input", human_prefix=human_prefix, ai_prefix=self.bot.name, llm=self.llm_summary, summary_prompt=prompt_progressive_summary, moving_summary_buffer=moving_summary, max_len=int(self.max_context-800), min_len=int(0.1*self.max_context), last_message_ids_summarized=last_message_ids_summarized)
             self.rooms[room_id]["memory"] = memory
             #memory.chat_memory.add_ai_message(self.bot.greeting)
         else:
@@ -251,7 +252,7 @@ class AI(object):
             tmp_prompt_text = prompt.format(chat_history=conversation_memory.buffer, input=message.content)
             prompt_len = self.llm_chat.get_num_tokens(tmp_prompt_text)
-            if prompt_len+200 > 2048:
+            if prompt_len+200 > self.max_context:
                 logger.warning(f"Prompt too large. Estimated {prompt_len} tokens. Summarizing...")
                 await reply_fn(f"<WARNING> Prompt too large. Estimated {prompt_len} tokens")
                 if i == 0:
@@ -460,6 +461,8 @@ class AI(object):
         conversation_memory.chat_memory.messages.append(message)
         #conversation_memory.chat_memory.add_system_message(message)
+        # [ 21:30 | Tuesday 9th | Pentagram City Alleys | 18°C | Overcast | 92% ]
+        # Summarize the last day and save a diary entry
         yesterday = ( datetime.now() - timedelta(days=1) ).strftime('%Y-%m-%d')
         for room_id in self.rooms.keys():
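
With this change a single self.max_context value drives both the conversation memory's summarization thresholds and the prompt-size guard. Below is a minimal standalone sketch of that arithmetic; the names MAX_CONTEXT, REPLY_HEADROOM and prompt_fits are illustrative and not taken from the repository, and the 200-token headroom comes from the prompt_len+200 check above.

# Minimal sketch (illustrative names, not the bot's code) of how one
# context-size value feeds both the memory limits and the prompt check.
MAX_CONTEXT = 2048        # as set in AI.__init__
REPLY_HEADROOM = 200      # room reserved for the generated reply (prompt_len+200)

# Mirrors max_len=int(self.max_context-800) and min_len=int(0.1*self.max_context):
memory_max_len = int(MAX_CONTEXT - 800)   # 1248 tokens of history before summarizing kicks in
memory_min_len = int(0.1 * MAX_CONTEXT)   # 204 tokens kept verbatim after summarizing

def prompt_fits(prompt_len: int, max_context: int = MAX_CONTEXT) -> bool:
    """True if the rendered prompt plus reply headroom fits the context window."""
    return prompt_len + REPLY_HEADROOM <= max_context

print(memory_max_len, memory_min_len, prompt_fits(1900))   # -> 1248 204 False

Switching to a larger-context backend should then only require changing self.max_context in one place.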

matrix_pygmalion_bot/bot/wrappers/koboldcpp.py (10 changed lines)

@@ -19,7 +19,7 @@ class KoboldCppTextWrapper(object):
     #python3 koboldcpp.py models/pygmalion-6b-v3-ggml-ggjt-q4_0.bin
     #python3 koboldcpp.py --smartcontext models/pygmalion-6b-v3-ggml-ggjt-q4_0.bin
-    async def generate(self, prompt: str, typing_fn, temperature=0.72, max_new_tokens=200, timeout=180):
+    async def generate(self, prompt: str, typing_fn, temperature=0.72, max_new_tokens=200, max_context=2048, timeout=180):
         # Set the API endpoint URL
         endpoint = f"http://{self.endpoint_name}/api/latest/generate"
@@ -31,12 +31,12 @@ class KoboldCppTextWrapper(object):
         # Define your inputs
         input_data = {
             "prompt": prompt,
-            "max_context_length": 2048,
+            "max_context_length": max_context,
             "max_length": max_new_tokens,
             "temperature": temperature,
-            "top_k": 50,
-            "top_p": 0.85,
-            "rep_pen": 1.08,
+            "top_k": 20,
+            "top_p": 0.9,
+            "rep_pen": 1.15,
             "rep_pen_range": 1024,
             "stop_sequence": ['<|endoftext|>'],
         }
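
For reference, the request body this wrapper now sends to KoboldCpp's /api/latest/generate endpoint looks roughly like the sketch below. The payload keys and values mirror the hunk above; the aiohttp client code, the localhost endpoint and the response handling are assumptions made to keep the example self-contained, since the wrapper's actual HTTP code is outside this hunk.

# Standalone sketch of the KoboldCpp generate call with the new defaults.
# Payload fields follow the diff; the HTTP client details are assumptions.
import asyncio
import aiohttp

async def generate(prompt: str, endpoint: str = "http://localhost:5001/api/latest/generate") -> str:
    input_data = {
        "prompt": prompt,
        "max_context_length": 2048,   # now taken from the max_context argument
        "max_length": 200,            # max_new_tokens
        "temperature": 0.72,
        "top_k": 20,                  # was 50
        "top_p": 0.9,                 # was 0.85
        "rep_pen": 1.15,              # was 1.08
        "rep_pen_range": 1024,
        "stop_sequence": ['<|endoftext|>'],
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(endpoint, json=input_data, timeout=aiohttp.ClientTimeout(total=180)) as resp:
            resp.raise_for_status()
            result = await resp.json()
    # KoboldCpp's KoboldAI-compatible API typically answers {"results": [{"text": "..."}]}
    return result["results"][0]["text"]

# asyncio.run(generate("You are a friendly bot.\nHuman: Hi!\nBot:"))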

matrix_pygmalion_bot/bot/wrappers/langchain_koboldcpp.py (13 changed lines)

@@ -20,19 +20,22 @@ class KoboldCpp(LLM):
     endpoint_url: str = "http://172.16.85.10:5001/api/latest/generate"
-    temperature: Optional[float] = 0.8
+    temperature: Optional[float] = 0.7
     """The temperature to use for sampling."""
+    max_context: Optional[int] = 2048
+    """The maximum context size."""
     max_tokens: Optional[int] = 256
     """The maximum number of tokens to generate."""
     top_p: Optional[float] = 0.90
     """The top-p value to use for sampling."""
-    repeat_penalty: Optional[float] = 1.1
+    repeat_penalty: Optional[float] = 1.15
     """The penalty to apply to repeated tokens."""
-    top_k: Optional[int] = 40
+    top_k: Optional[int] = 20
     """The top-k value to use for sampling."""
     stop: Optional[List[str]] = []
@@ -51,7 +54,7 @@ class KoboldCpp(LLM):
         #params = self.model_kwargs or {}
         input_data = {
             "prompt": prompt,
-            "max_context_length": 2048,
+            "max_context_length": self.max_context,
             "max_length": self.max_tokens,
             "temperature": self.temperature,
             "top_k": self.top_k,
@@ -101,7 +104,7 @@ class KoboldCpp(LLM):
         #params = self.model_kwargs or {}
         input_data = {
             "prompt": prompt,
-            "max_context_length": 2048,
+            "max_context_length": self.max_context,
             "max_length": self.max_tokens,
             "temperature": self.temperature,
             "top_k": self.top_k,

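Taken together, the new field defaults (temperature 0.7, top_k 20, top_p 0.9, repeat_penalty 1.15, max_context 2048) line up with the values passed explicitly for the summary LLM in langchain.py. A usage sketch under that assumption, with the import path and the 0.72 chat temperature used only as examples:

# Usage sketch; relies on the KoboldCpp fields shown in the hunks above.
from matrix_pygmalion_bot.bot.wrappers.langchain_koboldcpp import KoboldCpp

max_context = 2048

# Summary LLM: the new class defaults already provide temperature=0.7,
# top_k=20, top_p=0.9 and repeat_penalty=1.15.
llm_summary = KoboldCpp(
    endpoint_url="http://172.16.85.10:5001/api/latest/generate",
    max_context=max_context,
    max_tokens=512,
    stop=['<|endoftext|>'],
    verbose=True,
)

# Chat LLM: keeps its own temperature (example value) but shares the context size,
# so the memory limits computed in AI.__init__ match what the backend accepts.
llm_chat = KoboldCpp(
    temperature=0.72,
    endpoint_url="http://172.16.85.10:5001/api/latest/generate",
    max_context=max_context,
    stop=['<|endoftext|>'],
    verbose=True,
)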