
tweak llm parameters

Branch: master
Hendrik Langer committed 11 months ago · commit 104ace0c61

3 changed files:
  1. matrix_pygmalion_bot/bot/ai/langchain.py (11 changed lines)
  2. matrix_pygmalion_bot/bot/wrappers/koboldcpp.py (10 changed lines)
  3. matrix_pygmalion_bot/bot/wrappers/langchain_koboldcpp.py (13 changed lines)
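
In short, this commit moves both backends to one shared sampling profile and threads a configurable context size through the stack instead of a hard-coded 2048. The values involved, collected from the hunks below into a plain Python dict as a reading aid (the dict itself is not part of the commit):

# Shared sampling profile after this commit (summary aid only, not repo code).
NEW_SAMPLING = {
    "temperature": 0.7,          # langchain wrapper default, was 0.8; chat keeps the per-bot value
    "top_k": 20,                 # was 50 (koboldcpp.py) / 40 (langchain_koboldcpp.py)
    "top_p": 0.9,                # was 0.85 / 0.90 (and 0.1 for the summary LLM)
    "rep_pen": 1.15,             # was 1.08 / 1.1 (and 1.176 for the summary LLM)
    "max_context_length": 2048,  # still 2048, but now passed around as max_context
}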

matrix_pygmalion_bot/bot/ai/langchain.py (11 changed lines)

@@ -90,10 +90,11 @@ class AI(object):
         self.bot = bot
         self.memory_path = memory_path
         self.rooms = {}
+        self.max_context = 2048
         from ..wrappers.langchain_koboldcpp import KoboldCpp
-        self.llm_chat = KoboldCpp(temperature=self.bot.temperature, endpoint_url="http://172.16.85.10:5001/api/latest/generate", stop=['<|endoftext|>'], verbose=True)
-        self.llm_summary = KoboldCpp(temperature=0.7, repeat_penalty=1.176, top_k = 40, top_p= 0.1, endpoint_url="http://172.16.85.10:5001/api/latest/generate", stop=['<|endoftext|>'], max_tokens=512, verbose=True)
+        self.llm_chat = KoboldCpp(temperature=self.bot.temperature, endpoint_url="http://172.16.85.10:5001/api/latest/generate", max_context=self.max_context, stop=['<|endoftext|>'], verbose=True)
+        self.llm_summary = KoboldCpp(temperature=0.7, repeat_penalty=1.15, top_k = 20, top_p= 0.9, endpoint_url="http://172.16.85.10:5001/api/latest/generate", max_context=self.max_context, stop=['<|endoftext|>'], max_tokens=512, verbose=True)
         self.llm_chat_model = "pygmalion-7b"
         self.llm_summary_model = "vicuna-13b"
         self.text_wrapper = text_wrapper
@@ -122,7 +123,7 @@ class AI(object):
             last_message_ids_summarized = []
             if not human_prefix:
                 human_prefix = "Human"
-            memory = CustomMemory(memory_key="chat_history", input_key="input", human_prefix=human_prefix, ai_prefix=self.bot.name, llm=self.llm_summary, summary_prompt=prompt_progressive_summary, moving_summary_buffer=moving_summary, max_len=1200, min_len=200, last_message_ids_summarized=last_message_ids_summarized)
+            memory = CustomMemory(memory_key="chat_history", input_key="input", human_prefix=human_prefix, ai_prefix=self.bot.name, llm=self.llm_summary, summary_prompt=prompt_progressive_summary, moving_summary_buffer=moving_summary, max_len=int(self.max_context-800), min_len=int(0.1*self.max_context), last_message_ids_summarized=last_message_ids_summarized)
             self.rooms[room_id]["memory"] = memory
             #memory.chat_memory.add_ai_message(self.bot.greeting)
         else:
@@ -251,7 +252,7 @@ class AI(object):
             tmp_prompt_text = prompt.format(chat_history=conversation_memory.buffer, input=message.content)
             prompt_len = self.llm_chat.get_num_tokens(tmp_prompt_text)
-            if prompt_len+200 > 2048:
+            if prompt_len+200 > self.max_context:
                 logger.warning(f"Prompt too large. Estimated {prompt_len} tokens. Summarizing...")
                 await reply_fn(f"<WARNING> Prompt too large. Estimated {prompt_len} tokens")
                 if i == 0:
@@ -460,6 +461,8 @@ class AI(object):
         conversation_memory.chat_memory.messages.append(message)
         #conversation_memory.chat_memory.add_system_message(message)
+        # [ 21:30 | Tuesday 9th | Pentagram City Alleys | 18°C | Overcast | 92% ]
+        # Summarize the last day and save a diary entry
         yesterday = ( datetime.now() - timedelta(days=1) ).strftime('%Y-%m-%d')
         for room_id in self.rooms.keys():
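
With this change a single self.max_context value drives both the conversation memory's summarization thresholds and the prompt-size guard. Below is a minimal standalone sketch of that arithmetic; the names MAX_CONTEXT, REPLY_HEADROOM and prompt_fits are illustrative and not taken from the repository, and the 200-token headroom comes from the prompt_len+200 check above.

# Minimal sketch (illustrative names, not the bot's code) of how one
# context-size value feeds both the memory limits and the prompt check.
MAX_CONTEXT = 2048        # as set in AI.__init__
REPLY_HEADROOM = 200      # room reserved for the generated reply (prompt_len+200)

# Mirrors max_len=int(self.max_context-800) and min_len=int(0.1*self.max_context):
memory_max_len = int(MAX_CONTEXT - 800)   # 1248 tokens of history before summarizing kicks in
memory_min_len = int(0.1 * MAX_CONTEXT)   # 204 tokens kept verbatim after summarizing

def prompt_fits(prompt_len: int, max_context: int = MAX_CONTEXT) -> bool:
    """True if the rendered prompt plus reply headroom fits the context window."""
    return prompt_len + REPLY_HEADROOM <= max_context

print(memory_max_len, memory_min_len, prompt_fits(1900))   # -> 1248 204 False

Switching to a larger-context backend should then only require changing self.max_context in one place.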

matrix_pygmalion_bot/bot/wrappers/koboldcpp.py (10 changed lines)

@@ -19,7 +19,7 @@ class KoboldCppTextWrapper(object):
     #python3 koboldcpp.py models/pygmalion-6b-v3-ggml-ggjt-q4_0.bin
     #python3 koboldcpp.py --smartcontext models/pygmalion-6b-v3-ggml-ggjt-q4_0.bin
-    async def generate(self, prompt: str, typing_fn, temperature=0.72, max_new_tokens=200, timeout=180):
+    async def generate(self, prompt: str, typing_fn, temperature=0.72, max_new_tokens=200, max_context=2048, timeout=180):
         # Set the API endpoint URL
         endpoint = f"http://{self.endpoint_name}/api/latest/generate"
@@ -31,12 +31,12 @@ class KoboldCppTextWrapper(object):
         # Define your inputs
         input_data = {
             "prompt": prompt,
-            "max_context_length": 2048,
+            "max_context_length": max_context,
             "max_length": max_new_tokens,
             "temperature": temperature,
-            "top_k": 50,
-            "top_p": 0.85,
-            "rep_pen": 1.08,
+            "top_k": 20,
+            "top_p": 0.9,
+            "rep_pen": 1.15,
             "rep_pen_range": 1024,
             "stop_sequence": ['<|endoftext|>'],
         }
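
For reference, the request body this wrapper now sends to KoboldCpp's /api/latest/generate endpoint looks roughly like the sketch below. The payload keys and values mirror the hunk above; the aiohttp client code, the localhost endpoint and the response handling are assumptions made to keep the example self-contained, since the wrapper's actual HTTP code is outside this hunk.

# Standalone sketch of the KoboldCpp generate call with the new defaults.
# Payload fields follow the diff; the HTTP client details are assumptions.
import asyncio
import aiohttp

async def generate(prompt: str, endpoint: str = "http://localhost:5001/api/latest/generate") -> str:
    input_data = {
        "prompt": prompt,
        "max_context_length": 2048,   # now taken from the max_context argument
        "max_length": 200,            # max_new_tokens
        "temperature": 0.72,
        "top_k": 20,                  # was 50
        "top_p": 0.9,                 # was 0.85
        "rep_pen": 1.15,              # was 1.08
        "rep_pen_range": 1024,
        "stop_sequence": ['<|endoftext|>'],
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(endpoint, json=input_data, timeout=aiohttp.ClientTimeout(total=180)) as resp:
            resp.raise_for_status()
            result = await resp.json()
    # KoboldCpp's KoboldAI-compatible API typically answers {"results": [{"text": "..."}]}
    return result["results"][0]["text"]

# asyncio.run(generate("You are a friendly bot.\nHuman: Hi!\nBot:"))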

matrix_pygmalion_bot/bot/wrappers/langchain_koboldcpp.py (13 changed lines)

@@ -20,19 +20,22 @@ class KoboldCpp(LLM):
     endpoint_url: str = "http://172.16.85.10:5001/api/latest/generate"
-    temperature: Optional[float] = 0.8
+    temperature: Optional[float] = 0.7
     """The temperature to use for sampling."""
+    max_context: Optional[int] = 2048
+    """The maximum context size."""
     max_tokens: Optional[int] = 256
     """The maximum number of tokens to generate."""
     top_p: Optional[float] = 0.90
     """The top-p value to use for sampling."""
-    repeat_penalty: Optional[float] = 1.1
+    repeat_penalty: Optional[float] = 1.15
     """The penalty to apply to repeated tokens."""
-    top_k: Optional[int] = 40
+    top_k: Optional[int] = 20
     """The top-k value to use for sampling."""
     stop: Optional[List[str]] = []
@@ -51,7 +54,7 @@ class KoboldCpp(LLM):
         #params = self.model_kwargs or {}
         input_data = {
             "prompt": prompt,
-            "max_context_length": 2048,
+            "max_context_length": self.max_context,
             "max_length": self.max_tokens,
             "temperature": self.temperature,
             "top_k": self.top_k,
@@ -101,7 +104,7 @@ class KoboldCpp(LLM):
         #params = self.model_kwargs or {}
         input_data = {
             "prompt": prompt,
-            "max_context_length": 2048,
+            "max_context_length": self.max_context,
             "max_length": self.max_tokens,
             "temperature": self.temperature,
             "top_k": self.top_k,

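Taken together, the new field defaults (temperature 0.7, top_k 20, top_p 0.9, repeat_penalty 1.15, max_context 2048) line up with the values passed explicitly for the summary LLM in langchain.py. A usage sketch under that assumption, with the import path and the 0.72 chat temperature used only as examples:

# Usage sketch; relies on the KoboldCpp fields shown in the hunks above.
from matrix_pygmalion_bot.bot.wrappers.langchain_koboldcpp import KoboldCpp

max_context = 2048

# Summary LLM: the new class defaults already provide temperature=0.7,
# top_k=20, top_p=0.9 and repeat_penalty=1.15.
llm_summary = KoboldCpp(
    endpoint_url="http://172.16.85.10:5001/api/latest/generate",
    max_context=max_context,
    max_tokens=512,
    stop=['<|endoftext|>'],
    verbose=True,
)

# Chat LLM: keeps its own temperature (example value) but shares the context size,
# so the memory limits computed in AI.__init__ match what the backend accepts.
llm_chat = KoboldCpp(
    temperature=0.72,
    endpoint_url="http://172.16.85.10:5001/api/latest/generate",
    max_context=max_context,
    stop=['<|endoftext|>'],
    verbose=True,
)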