server.py

import os
import re
import time
import glob
from sys import exit
import torch
import argparse
import gradio as gr
import transformers
from transformers import AutoTokenizer
from transformers import GPTJForCausalLM, AutoModelForCausalLM, AutoModelForSeq2SeqLM, OPTForCausalLM, T5Tokenizer, T5ForConditionalGeneration, GPTJModel, AutoModel

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, help='Name of the model to load by default')
args = parser.parse_args()

loaded_preset = None
# Build the list of available model names from the models/ and torch-dumps/ folders
available_models = sorted(set(map(lambda x : x.split('/')[-1].replace('.pt', ''), glob.glob("models/*[!\.][!t][!x][!t]") + glob.glob("torch-dumps/*[!\.][!t][!x][!t]"))))

def load_model(model_name):
    print(f"Loading {model_name}...")
    t0 = time.time()

    # Loading the model
    if os.path.exists(f"torch-dumps/{model_name}.pt"):
        print("Loading in .pt format...")
        model = torch.load(f"torch-dumps/{model_name}.pt").cuda()
    elif model_name.lower().startswith(('gpt-neo', 'opt-', 'galactica')):
        if any(size in model_name for size in ('13b', '20b', '30b')):
            model = AutoModelForCausalLM.from_pretrained(f"models/{model_name}", device_map='auto', load_in_8bit=True)
        else:
            model = AutoModelForCausalLM.from_pretrained(f"models/{model_name}", low_cpu_mem_usage=True, torch_dtype=torch.float16).cuda()
    elif model_name in ['gpt-j-6B']:
        model = AutoModelForCausalLM.from_pretrained(f"models/{model_name}", low_cpu_mem_usage=True, torch_dtype=torch.float16).cuda()
    elif model_name in ['flan-t5', 't5-large']:
        model = T5ForConditionalGeneration.from_pretrained(f"models/{model_name}").cuda()
    else:
        model = AutoModelForCausalLM.from_pretrained(f"models/{model_name}", low_cpu_mem_usage=True, torch_dtype=torch.float16).cuda()

    # Loading the tokenizer
    if model_name.startswith('gpt4chan'):
        tokenizer = AutoTokenizer.from_pretrained("models/gpt-j-6B/")
    elif model_name in ['flan-t5']:
        tokenizer = T5Tokenizer.from_pretrained(f"models/{model_name}/")
    else:
        tokenizer = AutoTokenizer.from_pretrained(f"models/{model_name}/")

    print(f"Loaded the model in {(time.time()-t0):.2f} seconds.")
    return model, tokenizer
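
# Note: the torch-dumps/*.pt files loaded above are assumed to be whole models
# serialized with torch.save(), e.g. (illustrative sketch, not part of this script):
#   model = AutoModelForCausalLM.from_pretrained("models/gpt-j-6B", torch_dtype=torch.float16)
#   torch.save(model, "torch-dumps/gpt-j-6B.pt")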

# Removes empty replies from gpt4chan outputs
def fix_gpt4chan(s):
    for i in range(10):
        s = re.sub("--- [0-9]*\n>>[0-9]*\n---", "---", s)
        s = re.sub("--- [0-9]*\n *\n---", "---", s)
        s = re.sub("--- [0-9]*\n\n\n---", "---", s)
    return s
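
# Illustrative example (assumed gpt4chan thread format): a generated post such as
#   "--- 865467538\n>>865467537\n---" (a reply containing only a quote link)
# collapses to "---", so empty replies disappear from the output thread.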

def generate_reply(question, temperature, max_length, inference_settings, selected_model):
    global model, tokenizer, model_name, loaded_preset, preset

    if selected_model != model_name:
        model_name = selected_model
        model = None
        tokenizer = None
        torch.cuda.empty_cache()
        model, tokenizer = load_model(model_name)
    if inference_settings != loaded_preset:
        with open(f'presets/{inference_settings}.txt', 'r') as infile:
            preset = infile.read()
        loaded_preset = inference_settings

    torch.cuda.empty_cache()
    input_text = question
    input_ids = tokenizer.encode(str(input_text), return_tensors='pt').cuda()
    output = eval(f"model.generate(input_ids, {preset}).cuda()")
    reply = tokenizer.decode(output[0], skip_special_tokens=True)
    if model_name.startswith('gpt4chan'):
        reply = fix_gpt4chan(reply)
    return reply
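
# The preset file is read as raw text and spliced into the model.generate() call via
# eval(), so it is expected to contain generation keyword arguments. A hypothetical
# presets/Default.txt might look like (assumption, for illustration only):
#   do_sample=True, temperature=temperature, max_length=max_length, top_p=0.9
# where temperature and max_length resolve to the slider values local to generate_reply().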

# Choosing the default model
if args.model is not None:
    model_name = args.model
else:
    if len(available_models) == 0:
        print("No models are available! Please download at least one.")
        exit(0)
    elif len(available_models) == 1:
        i = 0
    else:
        print("The following models are available:\n")
        for i, model in enumerate(available_models):
            print(f"{i+1}. {model}")
        print(f"\nWhich one do you want to load? 1-{len(available_models)}\n")
        i = int(input())-1
    model_name = available_models[i]

model, tokenizer = load_model(model_name)

if model_name.startswith('gpt4chan'):
    default_text = "-----\n--- 865467536\nInput text\n--- 865467537\n"
else:
    default_text = "Common sense questions and answers\n\nQuestion: \nFactual answer:"

interface = gr.Interface(
    generate_reply,
    inputs=[
        gr.Textbox(value=default_text, lines=15),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label='Temperature', value=0.7),
        gr.Slider(minimum=1, maximum=2000, step=1, label='max_length', value=200),
        gr.Dropdown(choices=list(map(lambda x : x.split('/')[-1].split('.')[0], glob.glob("presets/*.txt"))), value="Default"),
        gr.Dropdown(choices=available_models, value=model_name),
    ],
    outputs=[
        gr.Textbox(placeholder="", lines=15),
    ],
    title="Text generation lab",
    description="Generate text using Large Language Models.",
)

interface.launch(share=False, server_name="0.0.0.0")
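
# With share=False and server_name="0.0.0.0", the interface is reachable from the local
# network only, on Gradio's default port (7860 unless overridden), e.g. http://<host-ip>:7860.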