@@ -46,9 +46,9 @@ def load_model(model_name):
     # Default settings
     if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.gptq_bits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]):
         if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
-            model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True)
+            model = AutoModelForCausalLM.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}"), device_map='auto', load_in_8bit=True)
         else:
-            model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16)
+            model = AutoModelForCausalLM.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16)
             if torch.has_mps:
                 device = torch.device('mps')
                 model = model.to(device)
@@ -76,11 +76,11 @@ def load_model(model_name):
                             num_bits=4, group_size=64,
                             group_dim=2, symmetric=False))

-        model = OptLM(f"facebook/{shared.model_name}", env, "models", policy)
+        model = OptLM(f"facebook/{shared.model_name}", env, shared.args.model_dir, policy)

     # DeepSpeed ZeRO-3
     elif shared.args.deepspeed:
-        model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16)
+        model = AutoModelForCausalLM.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}"), torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16)
         model = deepspeed.initialize(model=model, config_params=ds_config, model_parameters=None, optimizer=None, lr_scheduler=None)[0]
         model.module.eval() # Inference
         print(f"DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}")
@@ -89,8 +89,8 @@ def load_model(model_name):
     elif shared.is_RWKV:
         from modules.RWKV import RWKVModel, RWKVTokenizer

-        model = RWKVModel.from_pretrained(Path(f'models/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device="cpu" if shared.args.cpu else "cuda")
-        tokenizer = RWKVTokenizer.from_pretrained(Path('models'))
+        model = RWKVModel.from_pretrained(Path(f'{shared.args.model_dir}/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device="cpu" if shared.args.cpu else "cuda")
+        tokenizer = RWKVTokenizer.from_pretrained(Path(shared.args.model_dir))

         return model, tokenizer
@@ -142,7 +142,7 @@ def load_model(model_name):
         if shared.args.disk:
             params["offload_folder"] = shared.args.disk_cache_dir

-        checkpoint = Path(f'models/{shared.model_name}')
+        checkpoint = Path(f'{shared.args.model_dir}/{shared.model_name}')

         if shared.args.load_in_8bit and params.get('max_memory', None) is not None and params['device_map'] == 'auto':
             config = AutoConfig.from_pretrained(checkpoint)
@@ -159,10 +159,10 @@ def load_model(model_name):
         model = AutoModelForCausalLM.from_pretrained(checkpoint, **params)

     # Loading the tokenizer
-    if shared.model_name.lower().startswith(('gpt4chan', 'gpt-4chan', '4chan')) and Path("models/gpt-j-6B/").exists():
-        tokenizer = AutoTokenizer.from_pretrained(Path("models/gpt-j-6B/"))
+    if shared.model_name.lower().startswith(('gpt4chan', 'gpt-4chan', '4chan')) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
+        tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
     else:
-        tokenizer = AutoTokenizer.from_pretrained(Path(f"models/{shared.model_name}/"))
+        tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}/"))
     tokenizer.truncation_side = 'left'

     print(f"Loaded the model in {(time.time()-t0):.2f} seconds.")
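
Every hunk above replaces the hardcoded "models/" prefix with shared.args.model_dir, so the models directory becomes configurable from the command line. For context, here is a minimal sketch of the argument definition this relies on in modules/shared.py; the exact flag name, default, and help text are assumptions and are not part of this diff:

# modules/shared.py (sketch; flag name and default are assumed, not taken from this diff)
parser.add_argument('--model-dir', type=str, default='models/',
                    help='Path of the directory containing all models.')

With a definition along those lines, an invocation such as python server.py --model-dir /mnt/models --model llama-13b would load weights from a directory outside the repository tree while the default behavior stays unchanged.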