Update the PEFT library to support the MMOELORA type, adapt the training script to the new configuration, improve the dataset handling logic, add adapter injection, and extend the PeftType enum

This commit is contained in:
YunyaoZhou 2025-01-02 17:48:13 +08:00
parent 2cd1bb4993
commit 2062f90e5d
Signed by: shujakuin
GPG Key ID: 418C3CA28E350CCF
9 changed files with 3192 additions and 78 deletions
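For orientation, here is a minimal sketch of how the new pieces fit together, assuming the peft_library package in this repo is importable and that MMOELoraConfig and inject_adapter_in_model behave as shown in the diffs below; it is an illustration, not a drop-in replacement for train.py.

from transformers import AutoModelForVision2Seq
from peft_library import inject_adapter_in_model
from peft_library.tuners import MMOELoraConfig

# Load the base vision-language model (model id taken from train.sh below).
model = AutoModelForVision2Seq.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Build the MMOELoRA config with the same target modules as train.sh.
peft_config = MMOELoraConfig(target_modules=["q_proj", "v_proj"])

# inject_adapter_in_model wraps the config as {adapter_name: config} before
# handing it to the tuner and returns the model with the LoRA layers injected.
model = inject_adapter_in_model(peft_config, model)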

View File

@ -1,4 +1,5 @@
from transformers import Qwen2VLProcessor
import torch
def collate_fn_for_train(examples, processor: Qwen2VLProcessor):
@ -51,6 +52,7 @@ def collate_fn_for_train(examples, processor: Qwen2VLProcessor):
now_index += 1
now_index += 1
batch["labels"] = labels
# batch["task_id"] = torch.tensor([0] * len(labels), dtype=torch.long)
return batch
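The task_id line above is left commented out, so MMOELoraLinear.forward (further down) falls back to an all-zero task id per sample. If per-task gating is wanted later, a hypothetical helper along these lines could populate it; attach_task_id is not part of this commit.

import torch

def attach_task_id(batch, task_id: int = 0):
    # One task id per sample, matching the commented-out line in collate_fn_for_train.
    num_samples = batch["labels"].shape[0]
    batch["task_id"] = torch.full((num_samples,), task_id, dtype=torch.long)
    return batch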

View File

@ -17,9 +17,9 @@ class OCRVQADataset(Dataset):
self.vis_processor = vis_processor
self.text_processor = text_processor
if split == "train":
self.data = self.create_data(ann_path, split=1)[:1]
self.data = self.create_data(ann_path, split=1)[:200]
elif split == "test":
self.data = self.create_data(ann_path, split=3)[:1]
self.data = self.create_data(ann_path, split=3)[:200]
# self.instruction_pool = [
# "[vqa] {}",

View File

@ -1 +1 @@
from .mapping import get_peft_config, get_peft_model
from .mapping import get_peft_config, get_peft_model, inject_adapter_in_model

View File

@ -23,7 +23,7 @@ from peft.tuners.xlora.model import XLoraModel
from peft.config import PeftConfig
from peft.mixed_model import PeftMixedModel
from peft.peft_model import (
from .peft_model import (
PeftModel,
PeftModelForCausalLM,
PeftModelForFeatureExtraction,
@ -280,9 +280,9 @@ def inject_adapter_in_model(
# By instantiating a peft model we are injecting randomly initialized LoRA layers into the model's modules.
peft_model = tuner_cls(
model,
peft_config,
{adapter_name: peft_config},
adapter_name=adapter_name,
low_cpu_mem_usage=low_cpu_mem_usage,
)
print("ok")
return peft_model.model
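The signature change above matters because MMOELoraModel.__init__ (see the mmoelora.py diff below) stores its config argument as self.peft_config and then indexes it by adapter name, so the tuner must receive a dict keyed by adapter name rather than a bare config object. A toy illustration of that contract, using only standard Python:

class _ToyTuner:
    # Stand-in for MMOELoraModel: it expects config to be a dict keyed by adapter name.
    def __init__(self, model, config, adapter_name):
        self.model = model
        self.peft_config = config
        self.adapter_config = self.peft_config[adapter_name]  # would fail with a bare config object

tuner = _ToyTuner(model=None, config={"default": "mmoelora-config"}, adapter_name="default")
print(tuner.adapter_config)  # -> mmoelora-config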

File diff suppressed because it is too large

View File

@ -12,7 +12,9 @@ from transformers.pytorch_utils import Conv1D
from peft_library.utils.peft_types import PeftType
from peft_library.utils.constants import TRANSFORMERS_MODELS_TO_MMOELORA_TARGET_MODULES_MAPPING
from peft_library.utils.constants import (
TRANSFORMERS_MODELS_TO_MMOELORA_TARGET_MODULES_MAPPING,
)
from peft.utils.other import _freeze_adapter, _get_submodules, transpose
@ -23,7 +25,6 @@ from peft.tuners.lora import (
)
def is_bnb_available():
return importlib.util.find_spec("bitsandbytes") is not None
@ -47,12 +48,17 @@ class MMOELoraModel(LoraModel):
Create MMOELoRA (MMOE based LoRA) model from a pretrained transformers model.
"""
def __init__(self, model, config, adapter_name):
def __init__(self, model, config, adapter_name, **kwargs):
# LoraModel.__init__(self, model, config, adapter_name, **kwargs)
nn.Module.__init__(self)
self.model = model
self.forward = self.model.forward
self.peft_config = config
self.add_adapter(adapter_name, self.peft_config[adapter_name])
# self.add_adapter(adapter_name, self.peft_config[adapter_name])
import sys; print(__file__, sys._getframe().f_lineno)  # debug trace: before add_adapter
self.add_adapter(adapter_name, config=self.peft_config[adapter_name])
import sys; print(__file__, sys._getframe().f_lineno)  # debug trace: after add_adapter
def add_adapter(self, adapter_name, config=None):
if config is not None: # get the lora config
@ -64,14 +70,35 @@ class MMOELoraModel(LoraModel):
config = self._prepare_mmoelora_config(config, model_config) # load config
self.peft_config[adapter_name] = config  # substitute the original config
self._find_and_replace(adapter_name)
if len(self.peft_config) > 1 and self.peft_config[adapter_name].bias != "none":
raise ValueError(
"MMOELoraModel supports only 1 adapter with bias. When using multiple adapters, set bias to 'none' for all adapters."
)
print(self.peft_config)  # debug: show the resolved adapter configs
self.mark_only_lora_as_trainable(self.model, self.peft_config[adapter_name].bias)
self._mark_only_adapters_as_trainable(self.model)
if self.peft_config[adapter_name].inference_mode:
_freeze_adapter(self.model, adapter_name)
def mark_only_lora_as_trainable(self, model: nn.Module, bias: str = "none") -> None:
"""Only activate the LoRA layer as trainable"""
for n, p in model.named_parameters():
if "lora_" not in n:
p.requires_grad = False
if bias == "none":
return
elif bias == "all":
for n, p in model.named_parameters():
if "bias" in n:
p.requires_grad = True
elif bias == "lora_only":
for m in model.modules():
if isinstance(m, LoraLayer) and hasattr(m, "bias") and m.bias is not None:
m.bias.requires_grad = True
else:
raise NotImplementedError
def _find_and_replace(self, adapter_name):
"""Replace the target `Linear` module with LoRA layer (Linear+LoRA)"""
@ -106,8 +133,10 @@ class MMOELoraModel(LoraModel):
for target_key in lora_config.target_modules
)
if target_module_found:
if not is_target_modules_in_base_model:
is_target_modules_in_base_model = True
parent, target, target_name = _get_submodules(self.model, key)
bias = target.bias is not None
if isinstance(target, MMOELoraLayer):
@ -122,6 +151,8 @@ class MMOELoraModel(LoraModel):
if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt):
raise NotImplementedError
else:
# debug print
if isinstance(target, torch.nn.Linear):
in_features, out_features = (
target.in_features,
@ -154,11 +185,18 @@ class MMOELoraModel(LoraModel):
f"Target module {target} is not supported. "
f"Currently, only `torch.nn.Linear` and `Conv1D` are supported."
)
new_module = MMOELoraLinear(
adapter_name, in_features, out_features, bias=bias, **kwargs
adapter_name,
in_features,
out_features,
bias=bias,
base_layer=target,
**kwargs,
)
self._replace_module(parent, target_name, new_module, target)
if not is_target_modules_in_base_model:
raise ValueError(
f"Target modules {lora_config.target_modules} not found in the base model. "
@ -192,9 +230,16 @@ class MMOELoraModel(LoraModel):
class MMOELoraLayer(LoraLayer):
def __init__(self, in_features: int, out_features: int, expert_num: int):
def __init__(
self,
in_features: int,
out_features: int,
expert_num: int,
base_layer: nn.Linear = None,
):
super().__init__(base_layer=base_layer)
super().__init__(in_features, out_features)
self.in_features, self.out_features = in_features, out_features
self.expert_num = expert_num
def update_layer(
@ -235,7 +280,7 @@ class MMOELoraLayer(LoraLayer):
nn.init.zeros_(self.lora_B[adapter_name].loraB[i].mlp.weight)
class MMOELoraLinear(nn.Linear, MMOELoraLayer):
class MMOELoraLinear(nn.Module, MMOELoraLayer):
# Lora implemented in a dense layer
# base_layer holds the pretrained nn.Linear from the LLM; MMOELoraLayer holds the trainable LoRA experts
def __init__(
@ -243,25 +288,28 @@ class MMOELoraLinear(nn.Linear, MMOELoraLayer):
adapter_name: str,
in_features: int,
out_features: int,
base_layer: nn.Linear = None,
r: int = 0,
lora_alpha: int = 1,
lora_dropout: float = 0.0,
fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
**kwargs,
):
nn.Module.__init__(self)
init_lora_weights = kwargs.pop("init_lora_weights", True)
self.expert_num = kwargs.pop("expert_num", True)
self.task_num = kwargs.pop("task_num", True)
self.te_dim = kwargs.pop("task_embedding_dim", True)
nn.Linear.__init__(self, in_features, out_features, **kwargs)
MMOELoraLayer.__init__(
self,
in_features=in_features,
out_features=out_features,
expert_num=self.expert_num,
expert_num=self.expert_num,  # "expert_num" was already popped from kwargs above; popping again would always return the default
base_layer=base_layer,
)
# nn.Linear.__init__(self, in_features, out_features, **kwargs)
# init the Gate network
self.lora_task_embedding = nn.ModuleDict({})
self.lora_gate = nn.ModuleDict({})
@ -279,100 +327,90 @@ class MMOELoraLinear(nn.Linear, MMOELoraLayer):
if fan_in_fan_out:
self.weight.data = self.weight.data.T
nn.Linear.reset_parameters(self)
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
self.active_adapter = adapter_name
self._active_adapter = adapter_name
def merge(self, task_id):
if self.active_adapter not in self.lora_A.keys():
if self._active_adapter not in self.lora_A.keys():
return
if self.merged:
warnings.warn("Already merged. Nothing to do.")
return
if self.r[self.active_adapter] > 0:
expert_weight = self.lora_gate[self.active_adapter](
self.lora_task_embedding[self.active_adapter](task_id)
if self.r[self._active_adapter] > 0:
expert_weight = self.lora_gate[self._active_adapter](
self.lora_task_embedding[self._active_adapter](task_id)
)
for i in range(self.expert_num):
lora_A_weights = self.lora_A[self.active_adapter].loraA[i].mlp.weight
lora_B_weights = self.lora_B[self.active_adapter].loraB[i].mlp.weight
self.weight.data += (
lora_A_weights = self.lora_A[self._active_adapter].loraA[i].mlp.weight
lora_B_weights = self.lora_B[self._active_adapter].loraB[i].mlp.weight
self.base_layer.weight.data += (
transpose(
lora_B_weights @ lora_A_weights,
self.fan_in_fan_out,
)
* self.scaling[self.active_adapter]
* self.scaling[self._active_adapter]
* expert_weight[..., i]
)
self.merged = True
def unmerge(self, task_id):
if self.active_adapter not in self.lora_A.keys():
if self._active_adapter not in self.lora_A.keys():
return
if not self.merged:
warnings.warn("Already unmerged. Nothing to do.")
return
if self.r[self.active_adapter] > 0:
expert_weight = self.lora_gate[self.active_adapter](
self.lora_task_embedding[self.active_adapter](task_id)
if self.r[self._active_adapter] > 0:
expert_weight = self.lora_gate[self._active_adapter](
self.lora_task_embedding[self._active_adapter](task_id)
)
for i in range(self.expert_num):
lora_A_weights = self.lora_A[self.active_adapter].loraA[i].mlp.weight
lora_B_weights = self.lora_B[self.active_adapter].loraB[i].mlp.weight
self.weight.data -= (
lora_A_weights = self.lora_A[self._active_adapter].loraA[i].mlp.weight
lora_B_weights = self.lora_B[self._active_adapter].loraB[i].mlp.weight
self.base_layer.weight.data -= (
transpose(
lora_B_weights @ lora_A_weights,
self.fan_in_fan_out,
)
* self.scaling[self.active_adapter]
* self.scaling[self._active_adapter]
* expert_weight[..., i]
)
self.merged = False
def forward(self, x: torch.Tensor, **kwargs):
task_id = kwargs["task_id"]
# task_id = kwargs["task_id"]
for k, v in kwargs.items():  # debug: inspect the extra kwargs reaching this layer
print(k, v.shape)
task_id = torch.tensor([0] * len(x), dtype=torch.long).to(x.device)  # fallback: use task 0 for every sample
previous_dtype = x.dtype
if (
self.active_adapter not in self.lora_A.keys()
self._active_adapter not in self.lora_A.keys()
): # No adapter, directly use linear
return F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
return self.base_layer(x)
if self.disable_adapters: # No adapter
if (
self.r[self.active_adapter] > 0 and self.merged
self.r[self._active_adapter] > 0 and self.merged
): # merge the adapter to linear
self.unmerge(task_id)
result = F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
self.base_layer.weight.data = self.weight.data
elif (
self.r[self.active_adapter] > 0 and not self.merged
self.r[self._active_adapter] > 0 and not self.merged
): # general lora process
result = F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
result = self.base_layer(x)
x = x.to(self.lora_A[self.active_adapter].loraA[0].weight.dtype)
x = x.to(self.lora_A[self._active_adapter].loraA[0].weight.dtype)
expert_weight = self.lora_gate[self.active_adapter](
self.lora_task_embedding[self.active_adapter](task_id)
expert_weight = self.lora_gate[self._active_adapter](
self.lora_task_embedding[self._active_adapter](task_id)
)
for i in range(self.expert_num):
result += ( # lora process
self.lora_B[self.active_adapter].loraB[i](
self.lora_A[self.active_adapter].loraA[i](
self.lora_dropout[self.active_adapter](x)
),
)
* self.scaling[self.active_adapter]
* expert_weight[..., i].unsqueeze(-1).unsqueeze(0)
result += (
self.lora_B[self._active_adapter].loraB[i](
    self.lora_A[self._active_adapter].loraA[i](
        self.lora_dropout[self._active_adapter](x)
    )
)
* self.scaling[self._active_adapter]
* expert_weight[..., i].view(-1, 1, 1)
)
else:
result = F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
result = self.base_layer(x)
result = result.to(previous_dtype)
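Numerically, the un-merged branch above computes the base layer output plus a gate-weighted sum of per-expert LoRA updates. A self-contained sketch of that computation with made-up shapes (hidden size, rank, and expert count are illustrative only, not values used in this repo):

import torch
import torch.nn as nn

batch, seq, hidden, rank, experts = 2, 4, 8, 2, 3

x = torch.randn(batch, seq, hidden)
base_layer = nn.Linear(hidden, hidden)                                  # frozen pretrained weight
lora_A = [nn.Linear(hidden, rank, bias=False) for _ in range(experts)]  # per-expert down-projection
lora_B = [nn.Linear(rank, hidden, bias=False) for _ in range(experts)]  # per-expert up-projection
gate = torch.softmax(torch.randn(batch, experts), dim=-1)               # lora_gate(task_embedding(task_id))
scaling = 1.0

result = base_layer(x)
for i in range(experts):
    # Each expert's LoRA update is weighted by that expert's gate value per sample.
    result = result + lora_B[i](lora_A[i](x)) * scaling * gate[:, i].view(-1, 1, 1)
print(result.shape)  # torch.Size([2, 4, 8])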

View File

@ -26,13 +26,29 @@ from .other import CONFIG_NAME
class PeftType(str, enum.Enum):
PROMPT_TUNING = "PROMPT_TUNING"
MULTITASK_PROMPT_TUNING = "MULTITASK_PROMPT_TUNING"
P_TUNING = "P_TUNING"
PREFIX_TUNING = "PREFIX_TUNING"
LORA = "LORA"
ADALORA = "ADALORA"
BOFT = "BOFT"
ADAPTION_PROMPT = "ADAPTION_PROMPT"
IA3 = "IA3"
LOHA = "LOHA"
LOKR = "LOKR"
OFT = "OFT"
POLY = "POLY"
LN_TUNING = "LN_TUNING"
VERA = "VERA"
FOURIERFT = "FOURIERFT"
XLORA = "XLORA"
HRA = "HRA"
VBLORA = "VBLORA"
CPT = "CPT"
BONE = "BONE"
MMOELORAS = "MMOELORAS"
MMOELORA = "MMOELORA"
class TaskType(str, enum.Enum):
SEQ_CLS = "SEQ_CLS"

View File

@ -1,6 +1,6 @@
import torch
from dataset_library.factory import get_dataset
from transformers import AutoModelForVision2Seq, AutoProcessor, TrainingArguments
from transformers import AutoModelForVision2Seq, AutoProcessor, TrainingArguments, Qwen2VLForConditionalGeneration
from trl import (
ModelConfig,
@ -9,7 +9,7 @@ from trl import (
# get_peft_config,
get_quantization_config,
)
from peft_library import get_peft_model, get_peft_config
from peft_library import get_peft_model, get_peft_config, inject_adapter_in_model
from utils.trainer import ContinualTrainer
from utils.args import ContinualScriptArguments, ContinualModelConfig
@ -30,16 +30,6 @@ if __name__ == "__main__":
training_args.dataset_kwargs = {"skip_prepare_dataset": True}
# peft_config = get_peft_config(dict(**vars(model_args)))
if model_args.peft_type == "MMOELora":
from peft_library.tuners import MMOELoraConfig
peft_config = MMOELoraConfig(target_modules=model_args.lora_target_modules)
elif model_args.peft_type == "LORA":
from peft.tuners.lora import LoraConfig
peft_config = LoraConfig(target_modules=model_args.lora_target_modules)
else:
peft_config = None
torch_dtype = (
model_args.torch_dtype
@ -84,11 +74,26 @@ if __name__ == "__main__":
accelerator = create_accelerator_and_postprocess(training_args)
if peft_config is not None:
if model_args.peft_type == "MMOELORA":
from peft_library.tuners import MMOELoraConfig
peft_config = MMOELoraConfig(target_modules=model_args.lora_target_modules)
# model = get_peft_model(model, peft_config)
model = inject_adapter_in_model(peft_config, model)
print(model)
elif model_args.peft_type == "LORA":
from peft.tuners.lora import LoraConfig
peft_config = LoraConfig(target_modules=model_args.lora_target_modules)
model = get_peft_model(model, peft_config)
if accelerator.is_local_main_process:
model.print_trainable_parameters()
else:
peft_config = None
for dataset_name in script_args.dataset_name:
dataset = get_dataset(dataset_name)
model.train()

View File

@ -3,7 +3,7 @@
accelerate launch --config_file configs/accelerate_configs/deepspeed_zero2.yaml train.py \
--dataset_name OCR_VQA_200K OCR_VQA_200K OCR_VQA_200K \
--use_peft \
--peft_type LORA \
--peft_type MMOELORA \
--model_name_or_path Qwen/Qwen2-VL-7B-Instruct \
--lora_target_modules q_proj v_proj \
--per_device_train_batch_size 1 \