quant_loader.py

import sys
from pathlib import Path

import accelerate
import torch

import modules.shared as shared

sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))


# Loader for pre-quantized (GPTQ) models
def load_quantized(model_name, model_type):
    if model_type == 'llama':
        from llama import load_quant
    elif model_type == 'opt':
        from opt import load_quant
    else:
        print("Unknown pre-quantized model type specified. Only 'llama' and 'opt' are supported")
        exit()

    path_to_model = Path(f'models/{model_name}')
    pt_model = f'{model_name}-{shared.args.gptq_bits}bit.pt'

    # Try to find the .pt both in models/ and in the model's subfolder
    pt_path = None
    for path in [Path(p) for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]:
        if path.exists():
            pt_path = path

    if not pt_path:
        print(f"Could not find {pt_model}, exiting...")
        exit()

    model = load_quant(path_to_model, str(pt_path), shared.args.gptq_bits)

    # Multiple GPUs or GPU+CPU: build a per-device max_memory map and let
    # accelerate infer a device map without splitting individual decoder layers
    if shared.args.gpu_memory:
        max_memory = {}
        for i in range(len(shared.args.gpu_memory)):
            max_memory[i] = f"{shared.args.gpu_memory[i]}GiB"
        max_memory['cpu'] = f"{shared.args.cpu_memory or '99'}GiB"

        device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LLaMADecoderLayer"])
        model = accelerate.dispatch_model(model, device_map=device_map)

    # Single GPU
    else:
        model = model.to(torch.device('cuda:0'))

    return model
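

# A minimal usage sketch, not part of the original file: it assumes shared.args
# has already been populated by the webui's argument parser (gptq_bits,
# gpu_memory, cpu_memory), that the GPTQ-for-LLaMa repository is present under
# repositories/, and that a checkpoint named models/llama-7b-4bit.pt (or
# models/llama-7b/llama-7b-4bit.pt) exists. The model name 'llama-7b' is only
# an example.
if __name__ == '__main__':
    shared.args.gptq_bits = 4
    shared.args.gpu_memory = None   # falsy -> single-GPU path, model moved to cuda:0
    shared.args.cpu_memory = None

    model = load_quantized('llama-7b', 'llama')
    print(model)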