GPTQ_loader.py

import re
import sys
from pathlib import Path

import accelerate
import torch
import transformers
from transformers import AutoConfig, AutoModelForCausalLM

import modules.shared as shared

sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
import llama_inference_offload
from modelutils import find_layers
from quant import make_quant
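
# _load_quant() builds a skeleton model from the config in fp16, swaps its Linear
# layers for GPTQ quantized layers via make_quant(), and then loads the quantized
# checkpoint. The torch.nn.init functions are monkey-patched to no-ops so that
# building the skeleton does not waste time randomly initializing weights that the
# checkpoint is about to overwrite.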
def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128):
    config = AutoConfig.from_pretrained(model)

    def noop(*args, **kwargs):
        pass

    torch.nn.init.kaiming_uniform_ = noop
    torch.nn.init.uniform_ = noop
    torch.nn.init.normal_ = noop

    torch.set_default_dtype(torch.half)
    transformers.modeling_utils._init_weights = False
    torch.set_default_dtype(torch.half)
    model = AutoModelForCausalLM.from_config(config)
    torch.set_default_dtype(torch.float)
    model = model.eval()

    layers = find_layers(model)
    for name in exclude_layers:
        if name in layers:
            del layers[name]
    make_quant(model, layers, wbits, groupsize, faster=faster_kernel, kernel_switch_threshold=kernel_switch_threshold)
    del layers

    print('Loading model ...')
    if checkpoint.endswith('.safetensors'):
        from safetensors.torch import load_file as safe_load
        model.load_state_dict(safe_load(checkpoint))
    else:
        model.load_state_dict(torch.load(checkpoint))

    model.seqlen = 2048
    print('Done.')
    return model

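
# load_quantized() is the entry point of this module: it infers the model type
# (llama / opt / gptj) from the model name unless --model_type is given, picks the
# matching loader, locates the quantized .pt or .safetensors checkpoint under
# models/, and finally moves the model to the GPU or dispatches it with accelerate.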
def load_quantized(model_name):
    if not shared.args.model_type:
        # Try to determine model type from model name
        if model_name.lower().startswith(('llama', 'alpaca')):
            model_type = 'llama'
        elif model_name.lower().startswith(('opt', 'galactica')):
            model_type = 'opt'
        elif model_name.lower().startswith(('gpt-j', 'pygmalion-6b')):
            model_type = 'gptj'
        else:
            print("Can't determine model type from model name. Please specify it manually using --model_type "
                  "argument")
            exit()
    else:
        model_type = shared.args.model_type.lower()

    if model_type == 'llama' and shared.args.pre_layer:
        load_quant = llama_inference_offload.load_quant
    elif model_type in ('llama', 'opt', 'gptj'):
        load_quant = _load_quant
    else:
        print("Unknown pre-quantized model type specified. Only 'llama', 'opt' and 'gptj' are supported")
        exit()

    # Now we are going to try to locate the quantized model file.
    path_to_model = Path(f'models/{model_name}')
    found_pts = list(path_to_model.glob("*.pt"))
    found_safetensors = list(path_to_model.glob("*.safetensors"))
    pt_path = None

    if len(found_pts) == 1:
        pt_path = found_pts[0]
    elif len(found_safetensors) == 1:
        pt_path = found_safetensors[0]
    else:
        if path_to_model.name.lower().startswith('llama-7b'):
            pt_model = f'llama-7b-{shared.args.wbits}bit'
        elif path_to_model.name.lower().startswith('llama-13b'):
            pt_model = f'llama-13b-{shared.args.wbits}bit'
        elif path_to_model.name.lower().startswith('llama-30b'):
            pt_model = f'llama-30b-{shared.args.wbits}bit'
        elif path_to_model.name.lower().startswith('llama-65b'):
            pt_model = f'llama-65b-{shared.args.wbits}bit'
        else:
            pt_model = f'{model_name}-{shared.args.wbits}bit'

        # Try to find the .safetensors or .pt both in models/ and in the subfolder
        for path in [Path(p + ext) for ext in ['.safetensors', '.pt'] for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]:
            if path.exists():
                print(f"Found {path}")
                pt_path = path
                break

    if not pt_path:
        print("Could not find the quantized model in .pt or .safetensors format, exiting...")
        exit()

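    # Two loading paths from here: with --pre_layer (llama only), the CPU-offloading
    # loader from GPTQ-for-LLaMa's llama_inference_offload is used; otherwise the
    # generic _load_quant above is used, with the kernel switch threshold disabled
    # (set to False) for gptj models.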
    # qwopqwop200's offload
    if shared.args.pre_layer:
        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, shared.args.pre_layer)
    else:
        threshold = False if model_type == 'gptj' else 128
        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold)

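        # The --gpu-memory/--cpu-memory handling below accepts either bare numbers
        # (treated as GiB) or explicit strings that already end in "GiB"/"MiB"; the
        # resulting max_memory dict is what accelerate uses to build the device map.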
        # accelerate offload (doesn't work properly)
        if shared.args.gpu_memory:
            memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory))
            max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
            max_memory = {}
            for i in range(len(memory_map)):
                max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]
            max_memory['cpu'] = max_cpu_memory

            device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
            print("Using the following device map for the 4-bit model:", device_map)
            # https://huggingface.co/docs/accelerate/package_reference/big_modeling#accelerate.dispatch_model
            model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True)

        # No offload
        elif not shared.args.cpu:
            model = model.to(torch.device('cuda:0'))

    return model
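
# A minimal usage sketch (not part of this file): in text-generation-webui this module
# lives under modules/ and load_quantized() is called by the model-loading code when
# --wbits > 0. The attribute names below mirror the CLI flags; the exact call site and
# the remaining defaults (pre_layer, gpu_memory, cpu_memory, cpu) come from the web
# UI's argument parser and may differ between versions.
#
#     import modules.shared as shared
#     from modules.GPTQ_loader import load_quantized
#
#     shared.args.wbits = 4            # bits used when the checkpoint was quantized
#     shared.args.groupsize = 128      # GPTQ group size of the checkpoint (-1 if none)
#     shared.args.model_type = 'llama'
#     model = load_quantized('llama-7b')  # expects models/llama-7b/ with a .pt/.safetensors file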