
Double the pseudo-streaming chunk size: nearly every reply takes more than 16 tokens.

master
Hendrik Langer committed 2 years ago
commit 97eb29190e
matrix_pygmalion_bot/ai/koboldcpp.py (2 lines changed)

@@ -63,7 +63,7 @@ async def generate_sync(
     complete = False
     complete_reply = ""
     for i in range(TIMEOUT//DELAY):
-        input_data["max_length"] = 16 # pseudo streaming
+        input_data["max_length"] = 32 # pseudo streaming
         # Make the request
         try:
             r = requests.post(endpoint, json=input_data, headers=headers, timeout=600)
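The changed line sits inside a polling loop that fakes token streaming: the bot repeatedly asks the backend for a small fixed-size chunk (`max_length`) and appends each piece to the accumulated reply. A minimal self-contained sketch of that pattern is below; `fetch_chunk` and the empty-chunk stop condition are hypothetical stand-ins for the `requests.post` call to the KoboldCpp endpoint in the diff.

```python
CHUNK_SIZE = 32  # doubled from 16, since most replies exceed 16 tokens
TIMEOUT = 360    # assumed polling budget, mirroring TIMEOUT//DELAY in the diff
DELAY = 5

def pseudo_stream(fetch_chunk, prompt):
    """Accumulate a reply in CHUNK_SIZE-token pieces until the model stops."""
    complete_reply = ""
    for _ in range(TIMEOUT // DELAY):
        # Each request continues from the text generated so far.
        chunk = fetch_chunk(prompt + complete_reply, max_length=CHUNK_SIZE)
        complete_reply += chunk
        if not chunk:  # hypothetical stop signal: backend returned nothing new
            break
    return complete_reply

# Usage with a fake backend that yields three chunks, then an empty one.
chunks = iter(["Hello", " world", "!", ""])
reply = pseudo_stream(lambda prompt, max_length: next(chunks), "Hi")
print(reply)  # Hello world!
```

With a larger chunk size, fewer round trips are needed per reply, at the cost of coarser "streaming" updates to the chat client.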
