diff --git a/matrix_pygmalion_bot/bot/ai/langchain.py b/matrix_pygmalion_bot/bot/ai/langchain.py
index ee61a00..c6a095b 100644
--- a/matrix_pygmalion_bot/bot/ai/langchain.py
+++ b/matrix_pygmalion_bot/bot/ai/langchain.py
@@ -90,10 +90,11 @@ class AI(object):
         self.bot = bot
         self.memory_path = memory_path
         self.rooms = {}
+        self.max_context = 2048
 
         from ..wrappers.langchain_koboldcpp import KoboldCpp
-        self.llm_chat = KoboldCpp(temperature=self.bot.temperature, endpoint_url="http://172.16.85.10:5001/api/latest/generate", stop=['<|endoftext|>'], verbose=True)
-        self.llm_summary = KoboldCpp(temperature=0.7, repeat_penalty=1.176, top_k = 40, top_p= 0.1, endpoint_url="http://172.16.85.10:5001/api/latest/generate", stop=['<|endoftext|>'], max_tokens=512, verbose=True)
+        self.llm_chat = KoboldCpp(temperature=self.bot.temperature, endpoint_url="http://172.16.85.10:5001/api/latest/generate", max_context=self.max_context, stop=['<|endoftext|>'], verbose=True)
+        self.llm_summary = KoboldCpp(temperature=0.7, repeat_penalty=1.15, top_k = 20, top_p= 0.9, endpoint_url="http://172.16.85.10:5001/api/latest/generate", max_context=self.max_context, stop=['<|endoftext|>'], max_tokens=512, verbose=True)
         self.llm_chat_model = "pygmalion-7b"
         self.llm_summary_model = "vicuna-13b"
         self.text_wrapper = text_wrapper
@@ -122,7 +123,7 @@ class AI(object):
                 last_message_ids_summarized = []
             if not human_prefix:
                 human_prefix = "Human"
-            memory = CustomMemory(memory_key="chat_history", input_key="input", human_prefix=human_prefix, ai_prefix=self.bot.name, llm=self.llm_summary, summary_prompt=prompt_progressive_summary, moving_summary_buffer=moving_summary, max_len=1200, min_len=200, last_message_ids_summarized=last_message_ids_summarized)
+            memory = CustomMemory(memory_key="chat_history", input_key="input", human_prefix=human_prefix, ai_prefix=self.bot.name, llm=self.llm_summary, summary_prompt=prompt_progressive_summary, moving_summary_buffer=moving_summary, max_len=int(self.max_context-800), min_len=int(0.1*self.max_context), last_message_ids_summarized=last_message_ids_summarized)
             self.rooms[room_id]["memory"] = memory
             #memory.chat_memory.add_ai_message(self.bot.greeting)
         else:
@@ -251,7 +252,7 @@ class AI(object):
 
         tmp_prompt_text = prompt.format(chat_history=conversation_memory.buffer, input=message.content)
         prompt_len = self.llm_chat.get_num_tokens(tmp_prompt_text)
-        if prompt_len+200 > 2048:
+        if prompt_len+200 > self.max_context:
             logger.warning(f"Prompt too large. Estimated {prompt_len} tokens. Summarizing...")
             await reply_fn(f" Prompt too large. Estimated {prompt_len} tokens")
             if i == 0:
@@ -460,6 +461,8 @@ class AI(object):
 
         conversation_memory.chat_memory.messages.append(message)
         #conversation_memory.chat_memory.add_system_message(message)
 
+        # [ 21:30 | Tuesday 9th | Pentagram City Alleys | 18°C | Overcast | 92% ]
+        # Summarize the last day and save a diary entry
         yesterday = ( datetime.now() - timedelta(days=1) ).strftime('%Y-%m-%d')
         for room_id in self.rooms.keys():
diff --git a/matrix_pygmalion_bot/bot/wrappers/koboldcpp.py b/matrix_pygmalion_bot/bot/wrappers/koboldcpp.py
index 9af8f03..eb19760 100644
--- a/matrix_pygmalion_bot/bot/wrappers/koboldcpp.py
+++ b/matrix_pygmalion_bot/bot/wrappers/koboldcpp.py
@@ -19,7 +19,7 @@ class KoboldCppTextWrapper(object):
    #python3 koboldcpp.py models/pygmalion-6b-v3-ggml-ggjt-q4_0.bin
    #python3 koboldcpp.py --smartcontext models/pygmalion-6b-v3-ggml-ggjt-q4_0.bin
 
-    async def generate(self, prompt: str, typing_fn, temperature=0.72, max_new_tokens=200, timeout=180):
+    async def generate(self, prompt: str, typing_fn, temperature=0.72, max_new_tokens=200, max_context=2048, timeout=180):
         # Set the API endpoint URL
         endpoint = f"http://{self.endpoint_name}/api/latest/generate"
 
@@ -31,12 +31,12 @@ class KoboldCppTextWrapper(object):
         # Define your inputs
         input_data = {
             "prompt": prompt,
-            "max_context_length": 2048,
+            "max_context_length": max_context,
             "max_length": max_new_tokens,
             "temperature": temperature,
-            "top_k": 50,
-            "top_p": 0.85,
-            "rep_pen": 1.08,
+            "top_k": 20,
+            "top_p": 0.9,
+            "rep_pen": 1.15,
             "rep_pen_range": 1024,
             "stop_sequence": ['<|endoftext|>'],
         }
diff --git a/matrix_pygmalion_bot/bot/wrappers/langchain_koboldcpp.py b/matrix_pygmalion_bot/bot/wrappers/langchain_koboldcpp.py
index 7b42b6c..8eb3b9d 100644
--- a/matrix_pygmalion_bot/bot/wrappers/langchain_koboldcpp.py
+++ b/matrix_pygmalion_bot/bot/wrappers/langchain_koboldcpp.py
@@ -20,19 +20,22 @@ class KoboldCpp(LLM):
 
     endpoint_url: str = "http://172.16.85.10:5001/api/latest/generate"
 
-    temperature: Optional[float] = 0.8
+    temperature: Optional[float] = 0.7
     """The temperature to use for sampling."""
 
+    max_context: Optional[int] = 2048
+    """The maximum context size."""
+
     max_tokens: Optional[int] = 256
     """The maximum number of tokens to generate."""
 
     top_p: Optional[float] = 0.90
     """The top-p value to use for sampling."""
 
-    repeat_penalty: Optional[float] = 1.1
+    repeat_penalty: Optional[float] = 1.15
     """The penalty to apply to repeated tokens."""
 
-    top_k: Optional[int] = 40
+    top_k: Optional[int] = 20
     """The top-k value to use for sampling."""
 
     stop: Optional[List[str]] = []
@@ -51,7 +54,7 @@ class KoboldCpp(LLM):
         #params = self.model_kwargs or {}
         input_data = {
             "prompt": prompt,
-            "max_context_length": 2048,
+            "max_context_length": self.max_context,
             "max_length": self.max_tokens,
             "temperature": self.temperature,
             "top_k": self.top_k,
@@ -101,7 +104,7 @@ class KoboldCpp(LLM):
         #params = self.model_kwargs or {}
         input_data = {
             "prompt": prompt,
-            "max_context_length": 2048,
+            "max_context_length": self.max_context,
             "max_length": self.max_tokens,
             "temperature": self.temperature,
             "top_k": self.top_k,
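
A minimal sketch of the token-budget arithmetic this patch centralizes, assuming the 2048-token context it hard-codes. `estimate_tokens` is a hypothetical stand-in for `self.llm_chat.get_num_tokens`; the real code wires these values through `KoboldCpp` and `CustomMemory`:

# Sketch only: constants mirror self.max_context and the literals in the patch.
MAX_CONTEXT = 2048        # self.max_context, shared by llm_chat and llm_summary
RESPONSE_RESERVE = 200    # headroom kept free for the model's reply

def needs_summarizing(prompt_text: str, estimate_tokens) -> bool:
    """Mirrors `if prompt_len+200 > self.max_context` in langchain.py."""
    return estimate_tokens(prompt_text) + RESPONSE_RESERVE > MAX_CONTEXT

# CustomMemory sizing derived from the same budget:
memory_max_len = int(MAX_CONTEXT - 800)    # 1248: history kept before summarizing
memory_min_len = int(0.1 * MAX_CONTEXT)    # 204: target size of the rolling summary

Deriving `max_len` and `min_len` from `max_context` keeps the memory thresholds consistent if the context window is ever raised, rather than leaving the previous hard-coded 1200/200 to drift out of sync with the prompt-size check.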