llamacpp_model_alternative.py

'''
Based on
https://github.com/abetlen/llama-cpp-python

Documentation:
https://abetlen.github.io/llama-cpp-python/
'''
from llama_cpp import Llama

from modules import shared
from modules.callbacks import Iteratorize
class LlamaCppModel:
    def __init__(self):
        self.initialized = False

    @classmethod
    def from_pretrained(cls, path):
        result = cls()

        params = {
            'model_path': str(path),
            'n_ctx': 2048,
            'seed': 0,
            'n_threads': shared.args.threads or None
        }
        result.model = Llama(**params)

        # This is ugly, but the model and the tokenizer are the same object in this library.
        return result, result
    def encode(self, string):
        if type(string) is str:
            string = string.encode()
        return self.model.tokenize(string)
    def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=1, callback=None):
        # llama.cpp works on raw bytes, so the prompt is encoded before tokenizing
        # and each generated token is detokenized back into bytes.
        if type(context) is str:
            context = context.encode()
        tokens = self.model.tokenize(context)

        output = b""
        count = 0
        for token in self.model.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repetition_penalty):
            text = self.model.detokenize([token])
            output += text
            if callback:
                callback(text.decode())

            # Stop once the requested number of tokens or the end-of-sequence token is reached.
            count += 1
            if count >= token_count or (token == self.model.token_eos()):
                break

        return output.decode()
    def generate_with_streaming(self, **kwargs):
        with Iteratorize(self.generate, kwargs, callback=None) as generator:
            reply = ''
            for token in generator:
                reply += token
                yield reply
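
# ---------------------------------------------------------------------------
# Usage sketch (not part of the module, kept commented out): a minimal,
# hypothetical example of loading a model and generating text with this class.
# The model path is a placeholder, and it assumes shared.args has already been
# populated (in the web UI this happens when the server parses its
# command-line flags, including --threads).
# ---------------------------------------------------------------------------
# if __name__ == '__main__':
#     # from_pretrained returns the same object twice (model and "tokenizer").
#     model, tokenizer = LlamaCppModel.from_pretrained('models/ggml-model-q4_0.bin')
#
#     # Blocking generation: returns the full completion as one string.
#     print(model.generate(context="Hello, my name is", token_count=50))
#
#     # Streaming generation: yields the partial reply as it grows.
#     for reply in model.generate_with_streaming(context="Hello, my name is", token_count=50):
#         print(reply)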