A detailed write-up on the Hugging Face open-source library Accelerate: https://zhuanlan.zhihu.com/p/646610811

Official Hugging Face Accelerate documentation: https://huggingface.co/docs/accelerate/index

Worked example: http://www.360doc.com/content/23/0206/22/7673502_1066492586.shtml

Code example

The snippet below runs multi-GPU inference for LLaMA-33B with Accelerate in four steps: instantiate the model on the meta device with init_empty_weights (costing neither time nor memory), let infer_auto_device_map assign each layer to a GPU under a per-card memory budget, load the weights into place with load_checkpoint_in_model, and finally move every layer to its assigned device with dispatch_model.

# Multi-GPU inference of the 33B LLaMA model with Accelerate.
# LLaMA-33B in fp16 needs about 66 GB of GPU memory in total; each card here is capped at 35 GiB.

import torch
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_in_model, dispatch_model

cuda_list = '6,7'.split(',')  # GPU indices to use
memory = '35GiB'              # per-GPU memory budget
model_path = 'xxx'            # path to the model checkpoint
no_split_module_classes = LlamaForCausalLM._no_split_modules  # never split a decoder layer across devices

max_memory = {int(cuda): memory for cuda in cuda_list}
config = LlamaConfig.from_pretrained(model_path)
with init_empty_weights():
    # Instantiated on the meta device: takes no time and allocates no RAM or VRAM.
    model = LlamaForCausalLM._from_config(config, torch_dtype=torch.float16)

# Automatically assign each layer to a device under the memory budget.
device_map = infer_auto_device_map(model, max_memory=max_memory,
                                   no_split_module_classes=no_split_module_classes)
load_checkpoint_in_model(model, model_path, device_map=device_map)  # load the weights
model = dispatch_model(model, device_map=device_map)  # move each layer to its assigned device

tokenizer = LlamaTokenizer.from_pretrained(model_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # LLaMA tokenizers ship without a pad token
torch.set_grad_enabled(False)
model.eval()
sents = ['你是谁']  # "Who are you?"
ids = tokenizer(sents, max_length=1800, padding=True, truncation=True, return_tensors="pt")
ids = ids.to(model.device)  # inputs go to the device of the first model shard
outputs = model.generate(**ids, do_sample=False)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
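
On transformers versions with built-in Accelerate integration, the four explicit steps above can usually be collapsed into a single call: passing device_map="auto" to from_pretrained makes transformers compute the device map and dispatch the weights internally. A minimal sketch, assuming the same (placeholder) model_path and per-GPU budget as above:

import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

model_path = 'xxx'  # placeholder, as in the example above
# device_map="auto" runs Accelerate's device-map inference and dispatch internally.
model = LlamaForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    max_memory={6: '35GiB', 7: '35GiB'},  # same per-GPU cap as above
)
tokenizer = LlamaTokenizer.from_pretrained(model_path)

In either variant, printing model.hf_device_map after loading shows which layers ended up on which GPU.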