download-model.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  1. '''
  2. Downloads models from Hugging Face to models/model-name.
  3. Example:
  4. python download-model.py facebook/opt-1.3b
  5. '''
  6. import argparse
  7. import base64
  8. import datetime
  9. import hashlib
  10. import json
  11. import re
  12. import sys
  13. from pathlib import Path
  14. import requests
  15. import tqdm
  16. from tqdm.contrib.concurrent import thread_map
  17. def select_model_from_default_options():
  18. models = {
  19. "OPT 6.7B": ("facebook", "opt-6.7b", "main"),
  20. "OPT 2.7B": ("facebook", "opt-2.7b", "main"),
  21. "OPT 1.3B": ("facebook", "opt-1.3b", "main"),
  22. "OPT 350M": ("facebook", "opt-350m", "main"),
  23. "GALACTICA 6.7B": ("facebook", "galactica-6.7b", "main"),
  24. "GALACTICA 1.3B": ("facebook", "galactica-1.3b", "main"),
  25. "GALACTICA 125M": ("facebook", "galactica-125m", "main"),
  26. "Pythia-6.9B-deduped": ("EleutherAI", "pythia-6.9b-deduped", "main"),
  27. "Pythia-2.8B-deduped": ("EleutherAI", "pythia-2.8b-deduped", "main"),
  28. "Pythia-1.4B-deduped": ("EleutherAI", "pythia-1.4b-deduped", "main"),
  29. "Pythia-410M-deduped": ("EleutherAI", "pythia-410m-deduped", "main"),
  30. }
  31. choices = {}
  32. print("Select the model that you want to download:\n")
  33. for i, name in enumerate(models):
  34. char = chr(ord('A') + i)
  35. choices[char] = name
  36. print(f"{char}) {name}")
  37. char = chr(ord('A') + len(models))
  38. print(f"{char}) None of the above")
  39. print()
  40. print("Input> ", end='')
  41. choice = input()[0].strip().upper()
  42. if choice == char:
  43. print("""\nThen type the name of your desired Hugging Face model in the format organization/name.
  44. Examples:
  45. facebook/opt-1.3b
  46. EleutherAI/pythia-1.4b-deduped
  47. """)
  48. print("Input> ", end='')
  49. model = input()
  50. branch = "main"
  51. else:
  52. arr = models[choices[choice]]
  53. model = f"{arr[0]}/{arr[1]}"
  54. branch = arr[2]
  55. return model, branch
  56. def sanitize_model_and_branch_names(model, branch):
  57. if model[-1] == '/':
  58. model = model[:-1]
  59. if branch is None:
  60. branch = "main"
  61. else:
  62. pattern = re.compile(r"^[a-zA-Z0-9._-]+$")
  63. if not pattern.match(branch):
  64. raise ValueError("Invalid branch name. Only alphanumeric characters, period, underscore and dash are allowed.")
  65. return model, branch
  66. def get_download_links_from_huggingface(model, branch, text_only=False):
  67. base = "https://huggingface.co"
  68. page = f"/api/models/{model}/tree/{branch}?cursor="
  69. cursor = b""
  70. links = []
  71. sha256 = []
  72. classifications = []
  73. has_pytorch = False
  74. has_pt = False
  75. has_ggml = False
  76. has_safetensors = False
  77. is_lora = False
  78. while True:
  79. content = requests.get(f"{base}{page}{cursor.decode()}").content
  80. dict = json.loads(content)
  81. if len(dict) == 0:
  82. break
  83. for i in range(len(dict)):
  84. fname = dict[i]['path']
  85. if not is_lora and fname.endswith(('adapter_config.json', 'adapter_model.bin')):
  86. is_lora = True
  87. is_pytorch = re.match("(pytorch|adapter)_model.*\.bin", fname)
  88. is_safetensors = re.match(".*\.safetensors", fname)
  89. is_pt = re.match(".*\.pt", fname)
  90. is_ggml = re.match("ggml.*\.bin", fname)
  91. is_tokenizer = re.match("tokenizer.*\.model", fname)
  92. is_text = re.match(".*\.(txt|json|py|md)", fname) or is_tokenizer
  93. if any((is_pytorch, is_safetensors, is_pt, is_ggml, is_tokenizer, is_text)):
  94. if 'lfs' in dict[i]:
  95. sha256.append([fname, dict[i]['lfs']['oid']])
  96. if is_text:
  97. links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
  98. classifications.append('text')
  99. continue
  100. if not text_only:
  101. links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
  102. if is_safetensors:
  103. has_safetensors = True
  104. classifications.append('safetensors')
  105. elif is_pytorch:
  106. has_pytorch = True
  107. classifications.append('pytorch')
  108. elif is_pt:
  109. has_pt = True
  110. classifications.append('pt')
  111. elif is_ggml:
  112. has_ggml = True
  113. classifications.append('ggml')
  114. cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50'
  115. cursor = base64.b64encode(cursor)
  116. cursor = cursor.replace(b'=', b'%3D')
  117. # If both pytorch and safetensors are available, download safetensors only
  118. if (has_pytorch or has_pt) and has_safetensors:
  119. for i in range(len(classifications) - 1, -1, -1):
  120. if classifications[i] in ['pytorch', 'pt']:
  121. links.pop(i)
  122. return links, sha256, is_lora
  123. def get_output_folder(model, branch, is_lora, base_folder=None):
  124. if base_folder is None:
  125. base_folder = 'models' if not is_lora else 'loras'
  126. output_folder = f"{'_'.join(model.split('/')[-2:])}"
  127. if branch != 'main':
  128. output_folder += f'_{branch}'
  129. output_folder = Path(base_folder) / output_folder
  130. return output_folder
  131. def get_single_file(url, output_folder, start_from_scratch=False):
  132. filename = Path(url.rsplit('/', 1)[1])
  133. output_path = output_folder / filename
  134. if output_path.exists() and not start_from_scratch:
  135. # Check if the file has already been downloaded completely
  136. r = requests.get(url, stream=True)
  137. total_size = int(r.headers.get('content-length', 0))
  138. if output_path.stat().st_size >= total_size:
  139. return
  140. # Otherwise, resume the download from where it left off
  141. headers = {'Range': f'bytes={output_path.stat().st_size}-'}
  142. mode = 'ab'
  143. else:
  144. headers = {}
  145. mode = 'wb'
  146. r = requests.get(url, stream=True, headers=headers)
  147. with open(output_path, mode) as f:
  148. total_size = int(r.headers.get('content-length', 0))
  149. block_size = 1024
  150. with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t:
  151. for data in r.iter_content(block_size):
  152. t.update(len(data))
  153. f.write(data)
  154. def start_download_threads(file_list, output_folder, start_from_scratch=False, threads=1):
  155. thread_map(lambda url: get_single_file(url, output_folder, start_from_scratch=start_from_scratch), file_list, max_workers=threads, disable=True)
  156. def download_model_files(model, branch, links, sha256, output_folder, start_from_scratch=False, threads=1):
  157. # Creating the folder and writing the metadata
  158. if not output_folder.exists():
  159. output_folder.mkdir()
  160. with open(output_folder / 'huggingface-metadata.txt', 'w') as f:
  161. f.write(f'url: https://huggingface.co/{model}\n')
  162. f.write(f'branch: {branch}\n')
  163. f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n')
  164. sha256_str = ''
  165. for i in range(len(sha256)):
  166. sha256_str += f' {sha256[i][1]} {sha256[i][0]}\n'
  167. if sha256_str != '':
  168. f.write(f'sha256sum:\n{sha256_str}')
  169. # Downloading the files
  170. print(f"Downloading the model to {output_folder}")
  171. start_download_threads(links, output_folder, start_from_scratch=start_from_scratch, threads=threads)
  172. def check_model_files(model, branch, links, sha256, output_folder):
  173. # Validate the checksums
  174. validated = True
  175. for i in range(len(sha256)):
  176. fpath = (output_folder / sha256[i][0])
  177. if not fpath.exists():
  178. print(f"The following file is missing: {fpath}")
  179. validated = False
  180. continue
  181. with open(output_folder / sha256[i][0], "rb") as f:
  182. bytes = f.read()
  183. file_hash = hashlib.sha256(bytes).hexdigest()
  184. if file_hash != sha256[i][1]:
  185. print(f'Checksum failed: {sha256[i][0]} {sha256[i][1]}')
  186. validated = False
  187. else:
  188. print(f'Checksum validated: {sha256[i][0]} {sha256[i][1]}')
  189. if validated:
  190. print('[+] Validated checksums of all model files!')
  191. else:
  192. print('[-] Invalid checksums. Rerun download-model.py with the --clean flag.')
  193. if __name__ == '__main__':
  194. parser = argparse.ArgumentParser()
  195. parser.add_argument('MODEL', type=str, default=None, nargs='?')
  196. parser.add_argument('--branch', type=str, default='main', help='Name of the Git branch to download from.')
  197. parser.add_argument('--threads', type=int, default=1, help='Number of files to download simultaneously.')
  198. parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).')
  199. parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.')
  200. parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.')
  201. parser.add_argument('--check', action='store_true', help='Validates the checksums of model files.')
  202. args = parser.parse_args()
  203. branch = args.branch
  204. model = args.MODEL
  205. if model is None:
  206. model, branch = select_model_from_default_options()
  207. # Cleaning up the model/branch names
  208. try:
  209. model, branch = sanitize_model_and_branch_names(model, branch)
  210. except ValueError as err_branch:
  211. print(f"Error: {err_branch}")
  212. sys.exit()
  213. # Getting the download links from Hugging Face
  214. links, sha256, is_lora = get_download_links_from_huggingface(model, branch, text_only=args.text_only)
  215. # Getting the output folder
  216. output_folder = get_output_folder(model, branch, is_lora, base_folder=args.output)
  217. if args.check:
  218. # Check previously downloaded files
  219. check_model_files(model, branch, links, sha256, output_folder)
  220. else:
  221. # Download files
  222. download_model_files(model, branch, links, sha256, output_folder, threads=args.threads)