@@ -118,11 +118,9 @@ def generate_reply(question, generate_params, eos_token=None, stopping_strings=[
     # These models are not part of Hugging Face, so we handle them
     # separately and terminate the function call earlier
     if any((shared.is_RWKV, shared.is_llamacpp)):
-
         for k in ['temperature', 'top_p', 'top_k', 'repetition_penalty']:
             updated_params[k] = generate_params[k]
         updated_params["token_count"] = generate_params["max_new_tokens"]
-
         try:
             if shared.args.no_stream:
                 reply = shared.model.generate(context=question, **updated_params)
@@ -166,11 +164,10 @@ def generate_reply(question, generate_params, eos_token=None, stopping_strings=[
 
     updated_params["max_new_tokens"] = generate_params['max_new_tokens']
     if not shared.args.flexgen:
-        updated_params["eos_token_id"] = eos_token_ids
-        updated_params["stopping_criteria"] = stopping_criteria_list
         for k in ["do_sample", "temperature", "top_p", "typical_p", "repetition_penalty", "encoder_repetition_penalty", "top_k", "min_length", "no_repeat_ngram_size", "num_beams", "penalty_alpha", "length_penalty", "early_stopping"]:
             updated_params[k] = generate_params[k]
-
+        updated_params["eos_token_id"] = eos_token_ids
+        updated_params["stopping_criteria"] = stopping_criteria_list
         if shared.args.no_stream:
             updated_params["min_length"] = 0
     else:
@@ -179,7 +176,6 @@ def generate_reply(question, generate_params, eos_token=None, stopping_strings=[
         updated_params["stop"] = generate_params["eos_token_ids"][-1]
         if not shared.args.no_stream:
             updated_params["max_new_tokens"] = 8
-    print(updated_params)
 
     if shared.args.no_cache:
         updated_params.update({"use_cache": False})
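
For reference, a minimal standalone sketch (not the repository's code) of how the non-FlexGen Hugging Face branch assembles `updated_params` after this change: the sampling keys are copied from `generate_params` first, then `eos_token_id` and `stopping_criteria` are attached. The literal values for `generate_params`, `eos_token_ids`, and `stopping_criteria_list` below are placeholders; in `generate_reply` they are built earlier in the function.

# Hedged sketch: placeholder inputs standing in for what generate_reply builds upstream.
generate_params = {
    "do_sample": True, "temperature": 0.7, "top_p": 0.9, "typical_p": 1.0,
    "repetition_penalty": 1.1, "encoder_repetition_penalty": 1.0, "top_k": 40,
    "min_length": 0, "no_repeat_ngram_size": 0, "num_beams": 1,
    "penalty_alpha": 0.0, "length_penalty": 1.0, "early_stopping": False,
    "max_new_tokens": 200,
}
eos_token_ids = [2]            # placeholder EOS token id(s)
stopping_criteria_list = []    # would normally be a transformers.StoppingCriteriaList

updated_params = {"max_new_tokens": generate_params["max_new_tokens"]}
# Copy the sampling parameters first, then attach the stop conditions (the order used in this patch).
for k in ["do_sample", "temperature", "top_p", "typical_p", "repetition_penalty",
          "encoder_repetition_penalty", "top_k", "min_length", "no_repeat_ngram_size",
          "num_beams", "penalty_alpha", "length_penalty", "early_stopping"]:
    updated_params[k] = generate_params[k]
updated_params["eos_token_id"] = eos_token_ids
updated_params["stopping_criteria"] = stopping_criteria_list
# updated_params would then be passed as **kwargs to the model's generate call.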