Преглед изворног кода

Stop the bot from talking for you in chat mode

oobabooga пре 2 године
родитељ
комит
4578e88ffd
4 измењена фајла са 53 додато и 64 уклоњено
  1. 2 2
      modules/RWKV.py
  2. 10 12
      modules/callbacks.py
  3. 18 28
      modules/chat.py
  4. 23 22
      modules/text_generation.py

+ 2 - 2
modules/RWKV.py

@@ -45,11 +45,11 @@ class RWKVModel:
             token_stop = token_stop
         )
 
-        return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
+        return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
 
     def generate_with_streaming(self, **kwargs):
         with Iteratorize(self.generate, kwargs, callback=None) as generator:
-            reply = kwargs['context']
+            reply = ''
             for token in generator:
                 reply += token
                 yield reply

+ 10 - 12
modules/callbacks.py

@@ -11,24 +11,22 @@ import modules.shared as shared
 # Copied from https://github.com/PygmalionAI/gradio-ui/
 class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria):
 
-    def __init__(self, sentinel_token_ids: torch.LongTensor,
-                 starting_idx: int):
+    def __init__(self, sentinel_token_ids: list[torch.LongTensor], starting_idx: int):
         transformers.StoppingCriteria.__init__(self)
         self.sentinel_token_ids = sentinel_token_ids
         self.starting_idx = starting_idx
 
-    def __call__(self, input_ids: torch.LongTensor,
-                 _scores: torch.FloatTensor) -> bool:
+    def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
         for sample in input_ids:
             trimmed_sample = sample[self.starting_idx:]
-            # Can't unfold, output is still too tiny. Skip.
-            if trimmed_sample.shape[-1] < self.sentinel_token_ids.shape[-1]:
-                continue
-
-            for window in trimmed_sample.unfold(
-                    0, self.sentinel_token_ids.shape[-1], 1):
-                if torch.all(torch.eq(self.sentinel_token_ids, window)):
-                    return True
+
+            for i in range(len(self.sentinel_token_ids)):
+                # Can't unfold, output is still too tiny. Skip.
+                if trimmed_sample.shape[-1] < self.sentinel_token_ids[i].shape[-1]:
+                    continue
+                for window in trimmed_sample.unfold(0, self.sentinel_token_ids[i].shape[-1], 1):
+                    if torch.all(torch.eq(self.sentinel_token_ids[i], window)):
+                        return True
         return False
 
 class Stream(transformers.StoppingCriteria):

+ 18 - 28
modules/chat.py

@@ -51,41 +51,31 @@ def generate_chat_prompt(user_input, max_new_tokens, name1, name2, context, chat
     prompt = ''.join(rows)
     return prompt
 
-def extract_message_from_reply(question, reply, name1, name2, check, impersonate=False):
+def extract_message_from_reply(reply, name1, name2, check):
     next_character_found = False
 
-    asker = name1 if not impersonate else name2
-    replier = name2 if not impersonate else name1
-
-    previous_idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", question)]
-    idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", reply)]
-    idx = idx[max(len(previous_idx)-1, 0)]
-
-    if not impersonate:
-        reply = reply[idx + 1 + len(apply_extensions(f"{replier}:", "bot_prefix")):]
-    else:
-        reply = reply[idx + 1 + len(f"{replier}:"):]
-
     if check:
         lines = reply.split('\n')
         reply = lines[0].strip()
         if len(lines) > 1:
             next_character_found = True
     else:
-        idx = reply.find(f"\n{asker}:")
-        if idx != -1:
-            reply = reply[:idx]
-            next_character_found = True
-        reply = fix_newlines(reply)
+        for string in [f"\n{name1}:", f"\n{name2}:"]:
+            idx = reply.find(string)
+            if idx != -1:
+                reply = reply[:idx]
+                next_character_found = True
 
         # If something like "\nYo" is generated just before "\nYou:"
         # is completed, trim it
-        next_turn = f"\n{asker}:"
-        for j in range(len(next_turn)-1, 0, -1):
-            if reply[-j:] == next_turn[:j]:
-                reply = reply[:-j]
-                break
-
+        if not next_character_found:
+            for string in [f"\n{name1}:", f"\n{name2}:"]:
+                for j in range(len(string)-1, 0, -1):
+                    if reply[-j:] == string[:j]:
+                        reply = reply[:-j]
+                        break
+
+    reply = fix_newlines(reply)
     return reply, next_character_found
 
 def stop_everything_event():
@@ -127,10 +117,10 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
     # Generate
     reply = ''
     for i in range(chat_generation_attempts):
-        for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_string=f"\n{name1}:"):
+        for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
 
             # Extracting the reply
-            reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check)
+            reply, next_character_found = extract_message_from_reply(reply, name1, name2, check)
             visible_reply = re.sub("(<USER>|<user>|{{user}})", name1_original, reply)
             visible_reply = apply_extensions(visible_reply, "output")
             if shared.args.chat:
@@ -166,8 +156,8 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ
     # Yield *Is typing...*
     yield shared.processing_message
     for i in range(chat_generation_attempts):
-        for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_string=f"\n{name2}:"):
-            reply, next_character_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True)
+        for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=eos_token, stopping_strings=[f"\n{name1}:", f"\n{name2}:"]):
+            reply, next_character_found = extract_message_from_reply(reply, name1, name2, check)
             yield reply
             if next_character_found:
                 break

+ 23 - 22
modules/text_generation.py

@@ -99,25 +99,37 @@ def set_manual_seed(seed):
         if torch.cuda.is_available():
             torch.cuda.manual_seed_all(seed)
 
-def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=None, stopping_string=None):
+def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, encoder_repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, seed, eos_token=None, stopping_strings=[]):
     clear_torch_cache()
     set_manual_seed(seed)
     t0 = time.time()
 
+    original_question = question
+    if not (shared.args.chat or shared.args.cai_chat):
+        question = apply_extensions(question, "input")
+    if shared.args.verbose:
+        print(f"\n\n{question}\n--------------------\n")
+
     # These models are not part of Hugging Face, so we handle them
     # separately and terminate the function call earlier
     if shared.is_RWKV:
         try:
             if shared.args.no_stream:
                 reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k)
+                if not (shared.args.chat or shared.args.cai_chat):
+                    reply = original_question + apply_extensions(reply, "output")
                 yield formatted_outputs(reply, shared.model_name)
             else:
                 if not (shared.args.chat or shared.args.cai_chat):
                     yield formatted_outputs(question, shared.model_name)
+
                 # RWKV has proper streaming, which is very nice.
                 # No need to generate 8 tokens at a time.
                 for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k):
+                    if not (shared.args.chat or shared.args.cai_chat):
+                        reply = original_question + apply_extensions(reply, "output")
                     yield formatted_outputs(reply, shared.model_name)
+
         except Exception:
             traceback.print_exc()
         finally:
@@ -127,12 +139,6 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
             print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)")
             return
 
-    original_question = question
-    if not (shared.args.chat or shared.args.cai_chat):
-        question = apply_extensions(question, "input")
-    if shared.args.verbose:
-        print(f"\n\n{question}\n--------------------\n")
-
     input_ids = encode(question, max_new_tokens)
     original_input_ids = input_ids
     output = input_ids[0]
@@ -142,9 +148,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
     if eos_token is not None:
         eos_token_ids.append(int(encode(eos_token)[0][-1]))
     stopping_criteria_list = transformers.StoppingCriteriaList()
-    if stopping_string is not None:
-        # Copied from https://github.com/PygmalionAI/gradio-ui/blob/master/src/model.py
-        t = encode(stopping_string, 0, add_special_tokens=False)
+    if type(stopping_strings) is list and len(stopping_strings) > 0:
+        t = [encode(string, 0, add_special_tokens=False) for string in stopping_strings]
         stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=t, starting_idx=len(input_ids[0])))
 
     generate_params = {}
@@ -195,12 +200,10 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
             if shared.soft_prompt:
                 output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
 
+            new_tokens = len(output) - len(input_ids[0])
+            reply = decode(output[-new_tokens:])
             if not (shared.args.chat or shared.args.cai_chat):
-                new_tokens = len(output) - len(input_ids[0])
-                reply = decode(output[-new_tokens:])
                 reply = original_question + apply_extensions(reply, "output")
-            else:
-                reply = decode(output)
 
             yield formatted_outputs(reply, shared.model_name)
 
@@ -223,12 +226,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
                 for output in generator:
                     if shared.soft_prompt:
                         output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
+
+                    new_tokens = len(output) - len(input_ids[0])
+                    reply = decode(output[-new_tokens:])
                     if not (shared.args.chat or shared.args.cai_chat):
-                        new_tokens = len(output) - len(input_ids[0])
-                        reply = decode(output[-new_tokens:])
                         reply = original_question + apply_extensions(reply, "output")
-                    else:
-                        reply = decode(output)
 
                     if output[-1] in eos_token_ids:
                         break
@@ -244,12 +246,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
                     output = shared.model.generate(**generate_params)[0]
                 if shared.soft_prompt:
                     output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
+
+                new_tokens = len(output) - len(original_input_ids[0])
+                reply = decode(output[-new_tokens:])
                 if not (shared.args.chat or shared.args.cai_chat):
-                    new_tokens = len(output) - len(original_input_ids[0])
-                    reply = decode(output[-new_tokens:])
                     reply = original_question + apply_extensions(reply, "output")
-                else:
-                    reply = decode(output)
 
                 if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)):
                     break