llamacpp_model_alternative.py

'''
Based on
https://github.com/abetlen/llama-cpp-python

Documentation:
https://abetlen.github.io/llama-cpp-python/
'''
import multiprocessing

from llama_cpp import Llama

from modules import shared
from modules.callbacks import Iteratorize


class LlamaCppModel:
    def __init__(self):
        self.initialized = False

    @classmethod
    def from_pretrained(cls, path):
        result = cls()

        params = {
            'model_path': str(path),
            'n_ctx': 2048,
            'seed': 0,
            # Fall back to llama.cpp's own default when no thread count is given
            'n_threads': shared.args.threads or None
        }
        result.model = Llama(**params)

        # This is ugly, but the model and the tokenizer are the same object in this library.
        return result, result
    def encode(self, string):
        # llama.cpp tokenizes raw bytes, so encode str input to UTF-8 first
        if isinstance(string, str):
            string = string.encode()
        return self.model.tokenize(string)
    def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=1, callback=None):
        if isinstance(context, str):
            context = context.encode()
        tokens = self.model.tokenize(context)

        output = b""
        count = 0
        for token in self.model.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repetition_penalty):
            text = self.model.detokenize([token])
            output += text
            if callback:
                callback(text.decode())

            # Stop after token_count tokens or when the model emits an end-of-sequence token
            count += 1
            if count >= token_count or (token == self.model.token_eos()):
                break

        return output.decode()
    def generate_with_streaming(self, **kwargs):
        # Iteratorize turns the callback-based generate() into a generator,
        # yielding the partial reply after each new token
        with Iteratorize(self.generate, kwargs, callback=None) as generator:
            reply = ''
            for token in generator:
                reply += token
                yield reply
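

# Usage sketch (not part of the original module): assuming the webui's
# shared.args has already been populated and that a quantized llama.cpp
# model exists at the (hypothetical) path below, the wrapper can be
# driven like this, either blocking or streaming.
if __name__ == '__main__':
    model, tokenizer = LlamaCppModel.from_pretrained('models/ggml-model-q4_0.bin')

    # Blocking call: returns the whole completion as one string
    print(model.generate(context="Hello, my name is", token_count=32))

    # Streaming call: each yield is the full reply generated so far
    for reply in model.generate_with_streaming(context="Hello, my name is", token_count=32):
        print(reply)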