Advanced Development Guide
This guide covers advanced techniques and strategies for fine-tuning large language models, helping developers customize models more efficiently and effectively.
Advanced Fine-Tuning Architectures
Mixed-Precision Training
Mixed-precision training can significantly speed up fine-tuning while lowering memory requirements:
from accelerate import Accelerator

# Initialize the accelerator; "bf16" is preferred on Ampere and newer GPUs,
# "fp16" works on older hardware and relies on loss scaling (handled by Accelerate)
accelerator = Accelerator(mixed_precision="bf16")  # or "fp16"

# Prepare the model, optimizer, and dataloaders for mixed precision
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

# Training loop
for epoch in range(num_epochs):
    for batch in train_dataloader:
        with accelerator.accumulate(model):
            outputs = model(**batch)
            loss = outputs.loss
            # accelerator.backward applies gradient scaling when fp16 is used
            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()
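The loop above assumes model, optimizer, train_dataloader, eval_dataloader, and num_epochs already exist. For completeness, here is a minimal, hypothetical setup; any small causal LM and tokenized dataset would do:

import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical placeholders for the names used in the training loop
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

def tokenize(texts):
    # labels mirror input_ids so the model returns a causal LM loss
    batch = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    batch["labels"] = batch["input_ids"].clone()
    return batch

train_dataloader = DataLoader(["example text one", "example text two"],
                              batch_size=2, collate_fn=tokenize)
eval_dataloader = DataLoader(["held-out example"], batch_size=1, collate_fn=tokenize)
num_epochs = 3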
Multi-GPU Distributed Training
For large models, train in parallel across multiple GPUs:
from accelerate import Accelerator, DeepSpeedPlugin

# Configure DeepSpeed ZeRO-3 with CPU offloading for optimizer states and parameters
deepspeed_plugin = DeepSpeedPlugin(
    zero_stage=3,
    gradient_clipping=1.0,
    offload_optimizer_device="cpu",
    offload_param_device="cpu",
)

# Initialize the distributed environment. A single Accelerator instance must
# carry the DeepSpeed plugin together with the other settings; constructing a
# second Accelerator would discard the earlier configuration.
accelerator = Accelerator(
    deepspeed_plugin=deepspeed_plugin,
    gradient_accumulation_steps=2,
    mixed_precision="bf16",
    log_with="wandb",
)
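Equivalently, the same ZeRO-3 setup can be expressed as a raw DeepSpeed config dict passed through hf_ds_config. A sketch, assuming the standard DeepSpeed JSON schema (the key names below are DeepSpeed's, not Accelerate's):

# Hypothetical equivalent using a DeepSpeed config dict
ds_config = {
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {"device": "cpu"},
        "offload_param": {"device": "cpu"},
    },
    "gradient_clipping": 1.0,
    "bf16": {"enabled": True},
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": 2,
}
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=ds_config)
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin, log_with="wandb")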
Advanced LoRA Techniques
Conditional LoRA (C-LoRA)
Dynamically switch LoRA adapters based on the task type:
import copy
from peft import LoraConfig, TaskType, get_peft_model

# Create one LoRA configuration per task
lora_config_qa = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
)
lora_config_summarization = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.10,
)

# Create one adapter-wrapped copy per task. nn.Module has no .clone(),
# so deep-copy the (already loaded) base model instead.
model_qa = get_peft_model(copy.deepcopy(base_model), lora_config_qa)
model_summarization = get_peft_model(copy.deepcopy(base_model), lora_config_summarization)

# Route to an adapter based on the input
def select_adapter(input_text):
    if "summarize" in input_text.lower() or "summary" in input_text.lower():
        return model_summarization
    return model_qa
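Deep-copying the base model doubles memory use. PEFT can also host several named adapters on a single model and switch between them, which is usually the more economical route; a sketch using peft's named-adapter API with the two configs defined above:

# One base model, two named adapters; only the active adapter is applied
model = get_peft_model(base_model, lora_config_qa, adapter_name="qa")
model.add_adapter("summarization", lora_config_summarization)

def activate_adapter(input_text):
    if "summarize" in input_text.lower() or "summary" in input_text.lower():
        model.set_adapter("summarization")
    else:
        model.set_adapter("qa")
    return model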
Quantizing and Merging LoRA Adapters
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
from peft import PeftModel

# Load the base model in half precision. Merging LoRA weights into an
# already-quantized (e.g. 8-bit) model is generally not supported, so
# merge first and quantize afterwards.
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-13b-hf",
    torch_dtype=torch.float16,
    device_map="auto",
)
peft_model = PeftModel.from_pretrained(base_model, "./lora_adapters/qa_adapter")

# Merge the adapter into the base weights and save the full-precision model
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained("./merged_model")

# GPTQ quantization is applied at load time and needs a calibration
# dataset; save_pretrained does not accept a quantization_config
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-hf")
quantization_config = GPTQConfig(bits=4, group_size=128, dataset="c4", tokenizer=tokenizer)
quantized_model = AutoModelForCausalLM.from_pretrained(
    "./merged_model",
    quantization_config=quantization_config,
    device_map="auto",
)
quantized_model.save_pretrained("./merged_quantized_model")
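The saved 4-bit artifact then loads like any other checkpoint, assuming the GPTQ backend (optimum / auto-gptq) is installed; a brief usage sketch:

# Load the quantized model for inference; weights stay quantized on the GPU
model = AutoModelForCausalLM.from_pretrained("./merged_quantized_model", device_map="auto")
inputs = tokenizer("What is LoRA?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))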
Advanced Data Processing Techniques
Semantic Deduplication
Reduce redundant examples in the dataset:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Load a sentence encoder
encoder = SentenceTransformer('all-MiniLM-L6-v2')

# Compute an embedding for every example
df = pd.read_csv('training_data.csv')
texts = [f"{row['instruction']} {row['input']}" for _, row in df.iterrows()]
embeddings = encoder.encode(texts)

# Compute the pairwise similarity matrix
similarity_matrix = cosine_similarity(embeddings)

# Greedily mark examples to keep: the first occurrence survives,
# later near-duplicates are dropped
threshold = 0.92  # similarity threshold
to_keep = set(range(len(texts)))
for i in range(len(texts)):
    if i not in to_keep:
        continue
    for j in range(i + 1, len(texts)):
        if similarity_matrix[i, j] > threshold:
            to_keep.discard(j)  # drop the near-duplicate

# Filter the dataset (sorted() preserves the original row order)
filtered_df = df.iloc[sorted(to_keep)].reset_index(drop=True)
print(f"Original dataset: {len(df)} rows, after filtering: {len(filtered_df)} rows")
filtered_df.to_csv('deduped_data.csv', index=False)
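The full similarity matrix is O(n²) in memory, which becomes impractical beyond a few tens of thousands of rows. A sketch of the same greedy rule computed block by block (dedup_chunked and its chunk size are hypothetical, not part of any library):

def dedup_chunked(embeddings, threshold=0.92, chunk=1024):
    # Normalize once so dot products equal cosine similarities
    emb = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    n = len(emb)
    keep = np.ones(n, dtype=bool)
    for start in range(0, n, chunk):
        sims = emb[start:start + chunk] @ emb.T  # one block of similarity rows
        for i in range(start, min(start + chunk, n)):
            if not keep[i]:
                continue  # already dropped as an earlier row's duplicate
            dup = np.where(sims[i - start] > threshold)[0]
            keep[dup[dup > i]] = False  # drop later near-duplicates of row i
    return np.flatnonzero(keep)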
Progressive Fine-Tuning Ordering
Sort the fine-tuning data by difficulty (curriculum learning):
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd

# Load the model and tokenizer
model_id = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

# Compute perplexity as a difficulty proxy. Labels must be passed,
# otherwise the model returns no loss.
def calculate_perplexity(text):
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss
    return torch.exp(loss).item()

# Score the difficulty of every example in the dataset
df = pd.read_csv('training_data.csv')
difficulties = []
for _, row in df.iterrows():
    prompt = f"### Instruction:\n{row['instruction']}\n\n### Input:\n{row['input']}\n\n### Response:\n"
    target = row['output']
    perplexity = calculate_perplexity(prompt + target)
    difficulties.append(perplexity)
df['difficulty'] = difficulties

# Sort by difficulty (easy to hard)
df_sorted = df.sort_values('difficulty').reset_index(drop=True)
df_sorted.to_csv('curriculum_data.csv', index=False)
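The curriculum order only helps if training consumes the data sequentially; default dataloader shuffling would destroy it. A minimal sketch, assuming train_dataset was built from curriculum_data.csv in sorted order:

from torch.utils.data import DataLoader

# shuffle=False preserves the easy-to-hard ordering during training
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=False)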
Advanced RLHF Techniques
Building a Reward Model
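The reward model is trained on preference pairs (a chosen and a rejected response). It assigns each response a scalar score r, and is optimized with the pairwise ranking loss -log sigmoid(r_chosen - r_rejected), which pushes the score of the chosen response above that of the rejected one: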
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset

# Prepare preference data for the reward model
# Structure: [{"chosen": "good answer", "rejected": "bad answer"}, ...]
reward_data = [...]
reward_dataset = Dataset.from_list(reward_data)

# Prepare the model; a single classification head produces the scalar score
model_id = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # Llama has no pad token by default
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=1,  # single score output
    device_map="auto",
)
model.config.pad_token_id = tokenizer.pad_token_id

# Preprocessing: tokenize the chosen and rejected responses separately
def preprocess_reward_data(examples):
    chosen_inputs = tokenizer(examples["chosen"], truncation=True, padding="max_length", max_length=512)
    rejected_inputs = tokenizer(examples["rejected"], truncation=True, padding="max_length", max_length=512)
    return {
        "input_ids_chosen": chosen_inputs["input_ids"],
        "attention_mask_chosen": chosen_inputs["attention_mask"],
        "input_ids_rejected": rejected_inputs["input_ids"],
        "attention_mask_rejected": rejected_inputs["attention_mask"],
    }

tokenized_dataset = reward_dataset.map(preprocess_reward_data, batched=True)

# Custom collator: build one batch for chosen and one for rejected responses
def collate_fn(examples):
    chosen_batch = {
        "input_ids": torch.tensor([ex["input_ids_chosen"] for ex in examples]),
        "attention_mask": torch.tensor([ex["attention_mask_chosen"] for ex in examples]),
    }
    rejected_batch = {
        "input_ids": torch.tensor([ex["input_ids_rejected"] for ex in examples]),
        "attention_mask": torch.tensor([ex["attention_mask_rejected"] for ex in examples]),
    }
    return chosen_batch, rejected_batch

# Custom trainer implementing the pairwise ranking loss
class RewardTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        chosen_batch, rejected_batch = inputs
        chosen_rewards = model(**chosen_batch).logits
        rejected_rewards = model(**rejected_batch).logits
        # Pairwise loss: push chosen scores above rejected ones
        loss = -torch.log(torch.sigmoid(chosen_rewards - rejected_rewards)).mean()
        if return_outputs:
            return loss, chosen_rewards
        return loss

# remove_unused_columns must be False, otherwise Trainer drops the custom
# chosen/rejected columns before they ever reach the collator
training_args = TrainingArguments(
    output_dir="./reward_model",
    remove_unused_columns=False,
)

# Train the reward model
trainer = RewardTrainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=tokenized_dataset,
)
trainer.train()
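Once trained, the model maps any text to a scalar score, where higher should mean more preferred. A usage sketch (the example strings are hypothetical):

def reward_score(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        return model(**inputs).logits[0].item()

print(reward_score("A clear, accurate, and helpful answer."))
print(reward_score("An evasive non-answer."))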