import torch
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_in_model, dispatch_model
# Placeholders introduced here so the snippet runs standalone: a hypothetical
# checkpoint path, the GPU ids to use, and a per-GPU memory budget.
model_path = "path/to/llama-checkpoint"  # hypothetical path; point at your checkpoint
cuda_list = ["0", "1"]                   # example GPU ids
memory = "10GiB"                         # example per-GPU memory cap
max_memory = {int(cuda): memory for cuda in cuda_list}

config = LlamaConfig.from_pretrained(model_path)
with init_empty_weights():
    # Instantiated on the meta device: takes no time and consumes no RAM or VRAM.
    model = LlamaForCausalLM._from_config(config, torch_dtype=torch.float16)
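# The imports above bring in infer_auto_device_map, load_checkpoint_in_model and
# dispatch_model, but the snippet stops before calling them. A minimal sketch of
# the remaining steps under the same assumptions (model_path, max_memory as above);
# _no_split_modules is the model class's list of blocks that must not be split
# across devices, e.g. ["LlamaDecoderLayer"]:
no_split_module_classes = LlamaForCausalLM._no_split_modules
device_map = infer_auto_device_map(
    model, max_memory=max_memory, no_split_module_classes=no_split_module_classes
)
# Stream the real weights into the meta-initialized skeleton according to the
# device map, then attach hooks so each submodule runs on its assigned GPU.
load_checkpoint_in_model(model, model_path, device_map=device_map)
model = dispatch_model(model, device_map=device_map)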