A detailed write-up on the Hugging Face open-source library Accelerate: https://zhuanlan.zhihu.com/p/646610811

Official Hugging Face Accelerate documentation: https://huggingface.co/docs/accelerate/index

Worked example: http://www.360doc.com/content/23/0206/22/7673502_1066492586.shtml

Code example

The snippet below runs multi-GPU inference for LLaMA-33B with Accelerate in four steps: instantiate the model on the meta device with init_empty_weights (costing neither time nor memory), let infer_auto_device_map assign each layer to a GPU under a per-card memory budget, load the weights into place with load_checkpoint_in_model, and finally move every layer to its assigned device with dispatch_model.

# Multi-GPU inference of the 33B LLaMA model with Accelerate.
# LLaMA-33B in fp16 needs about 66 GB of GPU memory in total; each card here is capped at 35 GiB.

import torch
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_in_model, dispatch_model

cuda_list = '6,7'.split(',')  # GPU indices to use
memory = '35GiB'              # per-GPU memory budget
model_path = 'xxx'            # path to the model checkpoint
no_split_module_classes = LlamaForCausalLM._no_split_modules  # never split a decoder layer across devices

max_memory = {int(cuda): memory for cuda in cuda_list}
config = LlamaConfig.from_pretrained(model_path)
with init_empty_weights():
    # Instantiated on the meta device: takes no time and allocates no RAM or VRAM.
    model = LlamaForCausalLM._from_config(config, torch_dtype=torch.float16)

# Automatically assign each layer to a device under the memory budget.
device_map = infer_auto_device_map(model, max_memory=max_memory,
                                   no_split_module_classes=no_split_module_classes)
load_checkpoint_in_model(model, model_path, device_map=device_map)  # load the weights
model = dispatch_model(model, device_map=device_map)  # move each layer to its assigned device

tokenizer = LlamaTokenizer.from_pretrained(model_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # LLaMA tokenizers ship without a pad token
torch.set_grad_enabled(False)
model.eval()
sents = ['你是谁']  # "Who are you?"
ids = tokenizer(sents, max_length=1800, padding=True, truncation=True, return_tensors="pt")
ids = ids.to(model.device)  # inputs go to the device of the first model shard
outputs = model.generate(**ids, do_sample=False)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
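
On transformers versions with built-in Accelerate integration, the four explicit steps above can usually be collapsed into a single call: passing device_map="auto" to from_pretrained makes transformers compute the device map and dispatch the weights internally. A minimal sketch, assuming the same (placeholder) model_path and per-GPU budget as above:

import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

model_path = 'xxx'  # placeholder, as in the example above
# device_map="auto" runs Accelerate's device-map inference and dispatch internally.
model = LlamaForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    max_memory={6: '35GiB', 7: '35GiB'},  # same per-GPU cap as above
)
tokenizer = LlamaTokenizer.from_pretrained(model_path)

In either variant, printing model.hf_device_map after loading shows which layers ended up on which GPU.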