1. Local Deployment
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Llama 3 ships in 8B and 70B sizes; there is no 7B checkpoint.
model_name = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
)
input_text = "Please introduce yourself."
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)  # cap new tokens, not total length
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
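For quick local testing, the same task can also go through the transformers pipeline API, which bundles tokenization, generation, and decoding into one call. A minimal sketch, assuming the same Meta-Llama-3-8B checkpoint as above:

import torch
from transformers import pipeline

# The text-generation pipeline wraps tokenize -> generate -> decode in one object.
generator = pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3-8B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print(generator("Please introduce yourself.", max_new_tokens=100)[0]["generated_text"])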
2. Quantized Deployment
from auto_gptq import AutoGPTQForCausalLM

# The repo name below is a placeholder from the original text; substitute a real
# 4-bit GPTQ export of the model (Meta publishes no official GPTQ checkpoint).
model = AutoGPTQForCausalLM.from_quantized(
    "meta-llama/Llama-3-7b-GPTQ-4bit",
    device="cuda:0",   # load the quantized weights onto a single GPU
    use_triton=True,   # use Triton kernels for the 4-bit matmuls
)
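The quantized model exposes the standard generate() interface, so inference looks the same as in the local deployment above. A short usage sketch, assuming the base checkpoint's tokenizer:

from transformers import AutoTokenizer

# Tokenize with the base model's tokenizer, then generate from the GPTQ model.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
inputs = tokenizer("Please introduce yourself.", return_tensors="pt").to("cuda:0")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))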
3. Distributed Deployment
# Launch the entry script (here train.py) across 4 GPUs with the DeepSpeed
# launcher; parallelism and precision settings live in ds_config.json.
deepspeed --num_gpus=4 train.py \
    --model_name_or_path meta-llama/Meta-Llama-3-8B \
    --deepspeed ds_config.json
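The command references ds_config.json but the file itself is not shown. A minimal sketch of what it might contain, assuming ZeRO stage 3 partitioning and bf16 precision; the batch-size values are placeholders to tune for your hardware:

{
  "train_micro_batch_size_per_gpu": 1,
  "gradient_accumulation_steps": 8,
  "bf16": { "enabled": true },
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true
  }
}

ZeRO stage 3 shards parameters, gradients, and optimizer state across the 4 GPUs, which is what lets a model too large for one device run distributed.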