Update the PEFT library to support the MMOELORA type, adapt the training script to the new configuration, enhance dataset handling, add adapter injection, and extend the PeftType enum
This commit is contained in:
parent 2cd1bb4993
commit 2062f90e5d
@@ -1,4 +1,5 @@
from transformers import Qwen2VLProcessor
import torch


def collate_fn_for_train(examples, processor: Qwen2VLProcessor):
@@ -51,6 +52,7 @@ def collate_fn_for_train(examples, processor: Qwen2VLProcessor):
now_index += 1
now_index += 1
batch["labels"] = labels
# batch["task_id"] = torch.tensor([0] * len(labels), dtype=torch.long)

return batch
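The commented-out batch["task_id"] line above hints at the intended interface: the collator would eventually attach a per-example task index so the MMoE gate can route samples. A minimal sketch of that idea, assuming the collator has already produced batch["labels"]; the helper name and the constant task id 0 are illustrative, not part of the commit:

```python
import torch

def attach_task_id(batch: dict, task_id: int = 0) -> dict:
    # One task index per example, mirroring the commented-out line in the hunk above.
    num_examples = batch["labels"].shape[0]
    batch["task_id"] = torch.full((num_examples,), task_id, dtype=torch.long)
    return batch
```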
@@ -17,9 +17,9 @@ class OCRVQADataset(Dataset):
self.vis_processor = vis_processor
self.text_processor = text_processor
if split == "train":
self.data = self.create_data(ann_path, split=1)[:1]
self.data = self.create_data(ann_path, split=1)[:200]
elif split == "test":
self.data = self.create_data(ann_path, split=3)[:1]
self.data = self.create_data(ann_path, split=3)[:200]

# self.instruction_pool = [
# "[vqa] {}",
@@ -1 +1 @@
from .mapping import get_peft_config, get_peft_model
from .mapping import get_peft_config, get_peft_model, inject_adapter_in_model
@@ -23,7 +23,7 @@ from peft.tuners.xlora.model import XLoraModel

from peft.config import PeftConfig
from peft.mixed_model import PeftMixedModel
from peft.peft_model import (
from .peft_model import (
PeftModel,
PeftModelForCausalLM,
PeftModelForFeatureExtraction,
@@ -280,9 +280,9 @@ def inject_adapter_in_model(
# By instantiating a peft model we are injecting randomly initialized LoRA layers into the model's modules.
peft_model = tuner_cls(
model,
peft_config,
{adapter_name: peft_config},
adapter_name=adapter_name,
low_cpu_mem_usage=low_cpu_mem_usage,
)

print("ok")
return peft_model.model
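The second hunk changes how inject_adapter_in_model builds the tuner: the config is now passed as an {adapter_name: peft_config} mapping, and the function returns peft_model.model, i.e. the base model with randomly initialized MMOELoRA layers swapped in. A sketch of how the exported helper is then called from the training script (model loading shown for context; the exact MMOELoraConfig arguments are assumptions based on the train.py hunk below):

```python
from transformers import AutoModelForVision2Seq
from peft_library import inject_adapter_in_model
from peft_library.tuners import MMOELoraConfig

model = AutoModelForVision2Seq.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
peft_config = MMOELoraConfig(target_modules=["q_proj", "v_proj"])
# Returns the unwrapped base model with MMOELoRA layers injected into q_proj/v_proj.
model = inject_adapter_in_model(peft_config, model)
```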
src/peft_library/peft_model.py — new file, 3053 lines.
File diff suppressed because it is too large.
@@ -12,7 +12,9 @@ from transformers.pytorch_utils import Conv1D

from peft_library.utils.peft_types import PeftType

from peft_library.utils.constants import TRANSFORMERS_MODELS_TO_MMOELORA_TARGET_MODULES_MAPPING
from peft_library.utils.constants import (
TRANSFORMERS_MODELS_TO_MMOELORA_TARGET_MODULES_MAPPING,
)
from peft.utils.other import _freeze_adapter, _get_submodules, transpose


@@ -23,7 +25,6 @@ from peft.tuners.lora import (
)



def is_bnb_available():
return importlib.util.find_spec("bitsandbytes") is not None
@@ -47,12 +48,17 @@ class MMOELoraModel(LoraModel):
Create MMOELoRA (MMOE based LoRA) model from a pretrained transformers model.
"""

def __init__(self, model, config, adapter_name):
def __init__(self, model, config, adapter_name, **kwargs):
# LoraModel.__init__(self, model, config, adapter_name, **kwargs)
nn.Module.__init__(self)
self.model = model
self.forward = self.model.forward
self.peft_config = config
self.add_adapter(adapter_name, self.peft_config[adapter_name])
# self.add_adapter(adapter_name, self.peft_config[adapter_name])

import sys; print(__file__, sys._getframe().f_lineno)
self.add_adapter(adapter_name, config=self.peft_config[adapter_name])
import sys; print(__file__, sys._getframe().f_lineno)

def add_adapter(self, adapter_name, config=None):
if config is not None: # get the lora config
@@ -64,14 +70,35 @@ class MMOELoraModel(LoraModel):
config = self._prepare_mmoelora_config(config, model_config) # load config
self.peft_config[adapter_name] = config # subsititue the original config
self._find_and_replace(adapter_name)


if len(self.peft_config) > 1 and self.peft_config[adapter_name].bias != "none":
raise ValueError(
"MMOELoraModel supports only 1 adapter with bias. When using multiple adapters, set bias to 'none' for all adapters."
)
print(self.peft_config)
self.mark_only_lora_as_trainable(self.model, self.peft_config[adapter_name].bias)

self._mark_only_adapters_as_trainable(self.model)
if self.peft_config[adapter_name].inference_mode:
_freeze_adapter(self.model, adapter_name)

def mark_only_lora_as_trainable(self,model: nn.Module, bias: str = "none") -> None:
"""Only activate the LoRA layer as trainable"""
for n, p in model.named_parameters():
if "lora_" not in n:
p.requires_grad = False
if bias == "none":
return
elif bias == "all":
for n, p in model.named_parameters():
if "bias" in n:
p.requires_grad = True
elif bias == "lora_only":
for m in model.modules():
if isinstance(m, LoraLayer) and hasattr(m, "bias") and m.bias is not None:
m.bias.requires_grad = True
else:
raise NotImplementedError

def _find_and_replace(self, adapter_name):
"""Replace the target `Linear` module with LoRA layer (Linear+LoRA)"""
@@ -106,8 +133,10 @@ class MMOELoraModel(LoraModel):
for target_key in lora_config.target_modules
)
if target_module_found:

if not is_target_modules_in_base_model:
is_target_modules_in_base_model = True

parent, target, target_name = _get_submodules(self.model, key)
bias = target.bias is not None
if isinstance(target, MMOELoraLayer):
@@ -122,6 +151,8 @@ class MMOELoraModel(LoraModel):
if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt):
raise NotImplementedError
else:
# debug print

if isinstance(target, torch.nn.Linear):
in_features, out_features = (
target.in_features,
@@ -154,11 +185,18 @@ class MMOELoraModel(LoraModel):
f"Target module {target} is not supported. "
f"Currently, only `torch.nn.Linear` and `Conv1D` are supported."
)

new_module = MMOELoraLinear(
adapter_name, in_features, out_features, bias=bias, **kwargs
adapter_name,
in_features,
out_features,
bias=bias,
base_layer=target,
**kwargs,
)

self._replace_module(parent, target_name, new_module, target)

if not is_target_modules_in_base_model:
raise ValueError(
f"Target modules {lora_config.target_modules} not found in the base model. "
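For reference, the replacement step that _find_and_replace delegates to boils down to swapping the matched child module on its parent while keeping the original nn.Linear around as base_layer. A simplified sketch of that idea (the real _replace_module also copies state and moves buffers, which is omitted here; the helper name is hypothetical):

```python
import torch.nn as nn

def replace_child(parent: nn.Module, child_name: str, new_module: nn.Module, old_module: nn.Module) -> None:
    # Swap the matched Linear for the MMOELoRA wrapper; the wrapper keeps old_module
    # as base_layer, so the pretrained weight is reused rather than re-created.
    setattr(parent, child_name, new_module)
    new_module.to(next(old_module.parameters()).device)
```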
@@ -192,9 +230,16 @@ class MMOELoraModel(LoraModel):

class MMOELoraLayer(LoraLayer):

def __init__(self, in_features: int, out_features: int, expert_num: int):
def __init__(
self,
in_features: int,
out_features: int,
expert_num: int,
base_layer: nn.Linear = None,
):
super().__init__(base_layer=base_layer)

super().__init__(in_features, out_features)
self.in_features, self.out_features = in_features, out_features
self.expert_num = expert_num

def update_layer(
@@ -235,7 +280,7 @@ class MMOELoraLayer(LoraLayer):
nn.init.zeros_(self.lora_B[adapter_name].loraB[i].mlp.weight)


class MMOELoraLinear(nn.Linear, MMOELoraLayer):
class MMOELoraLinear(nn.Module, MMOELoraLayer):
# Lora implemented in a dense layer
# nn.Linear is the pretrained weights in LLM, MMOELoraLayer is the designed trainable Lora
def __init__(
@@ -243,25 +288,28 @@ class MMOELoraLinear(nn.Linear, MMOELoraLayer):
adapter_name: str,
in_features: int,
out_features: int,
base_layer: nn.Linear = None,
r: int = 0,
lora_alpha: int = 1,
lora_dropout: float = 0.0,
fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
**kwargs,
):
nn.Module.__init__(self)
init_lora_weights = kwargs.pop("init_lora_weights", True)
self.expert_num = kwargs.pop("expert_num", True)
self.task_num = kwargs.pop("task_num", True)
self.te_dim = kwargs.pop("task_embedding_dim", True)

nn.Linear.__init__(self, in_features, out_features, **kwargs)
MMOELoraLayer.__init__(
self,
in_features=in_features,
out_features=out_features,
expert_num=self.expert_num,
expert_num=kwargs.pop("expert_num", 2),
base_layer=base_layer,
)

# nn.Linear.__init__(self, in_features, out_features, **kwargs)

# init the Gate network
self.lora_task_embedding = nn.ModuleDict({})
self.lora_gate = nn.ModuleDict({})
@@ -279,100 +327,90 @@ class MMOELoraLinear(nn.Linear, MMOELoraLayer):
if fan_in_fan_out:
self.weight.data = self.weight.data.T

nn.Linear.reset_parameters(self)
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
self.active_adapter = adapter_name
self._active_adapter = adapter_name

def merge(self, task_id):
if self.active_adapter not in self.lora_A.keys():
if self._active_adapter not in self.lora_A.keys():
return
if self.merged:
warnings.warn("Already merged. Nothing to do.")
return
if self.r[self.active_adapter] > 0:
expert_weight = self.lora_gate[self.active_adapter](
self.lora_task_embedding[self.active_adapter](task_id)
if self.r[self._active_adapter] > 0:
expert_weight = self.lora_gate[self._active_adapter](
self.lora_task_embedding[self._active_adapter](task_id)
)
for i in range(self.expert_num):
lora_A_weights = self.lora_A[self.active_adapter].loraA[i].mlp.weight
lora_B_weights = self.lora_B[self.active_adapter].loraB[i].mlp.weight
self.weight.data += (
lora_A_weights = self.lora_A[self._active_adapter].loraA[i].mlp.weight
lora_B_weights = self.lora_B[self._active_adapter].loraB[i].mlp.weight
self.base_layer.weight.data += (
transpose(
lora_B_weights @ lora_A_weights,
self.fan_in_fan_out,
)
* self.scaling[self.active_adapter]
* self.scaling[self._active_adapter]
* expert_weight[..., i]
)
self.merged = True

def unmerge(self, task_id):
if self.active_adapter not in self.lora_A.keys():
if self._active_adapter not in self.lora_A.keys():
return
if not self.merged:
warnings.warn("Already unmerged. Nothing to do.")
return
if self.r[self.active_adapter] > 0:
expert_weight = self.lora_gate[self.active_adapter](
self.lora_task_embedding[self.active_adapter](task_id)
if self.r[self._active_adapter] > 0:
expert_weight = self.lora_gate[self._active_adapter](
self.lora_task_embedding[self._active_adapter](task_id)
)
for i in range(self.expert_num):
lora_A_weights = self.lora_A[self.active_adapter].loraA[i].mlp.weight
lora_B_weights = self.lora_B[self.active_adapter].loraB[i].mlp.weight
self.weight.data -= (
lora_A_weights = self.lora_A[self._active_adapter].loraA[i].mlp.weight
lora_B_weights = self.lora_B[self._active_adapter].loraB[i].mlp.weight
self.base_layer.weight.data -= (
transpose(
lora_B_weights @ lora_A_weights,
self.fan_in_fan_out,
)
* self.scaling[self.active_adapter]
* self.scaling[self._active_adapter]
* expert_weight[..., i]
)
self.merged = False

def forward(self, x: torch.Tensor, **kwargs):
task_id = kwargs["task_id"]
# task_id = kwargs["task_id"]
for k,v in kwargs.items():
print(k, v.shape)
task_id = torch.tensor([0] * len(x), dtype=torch.long).to(x.device)
previous_dtype = x.dtype

if (
self.active_adapter not in self.lora_A.keys()
self._active_adapter not in self.lora_A.keys()
): # No adapter, directly use linear
return F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
return self.base_layer(x)
if self.disable_adapters: # No adapter
if (
self.r[self.active_adapter] > 0 and self.merged
self.r[self._active_adapter] > 0 and self.merged
): # merge the adapter to linear
self.unmerge(task_id)
result = F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
self.base_layer.weight.data = self.weight.data
elif (
self.r[self.active_adapter] > 0 and not self.merged
self.r[self._active_adapter] > 0 and not self.merged
): # general lora process
result = F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
result = self.base_layer(x)

x = x.to(self.lora_A[self.active_adapter].loraA[0].weight.dtype)
x = x.to(self.lora_A[self._active_adapter].loraA[0].weight.dtype)

expert_weight = self.lora_gate[self.active_adapter](
self.lora_task_embedding[self.active_adapter](task_id)
expert_weight = self.lora_gate[self._active_adapter](
self.lora_task_embedding[self._active_adapter](task_id)
)
for i in range(self.expert_num):
result += ( # lora process
self.lora_B[self.active_adapter].loraB[i](
self.lora_A[self.active_adapter].loraA[i](
self.lora_dropout[self.active_adapter](x)
),
)
* self.scaling[self.active_adapter]
* expert_weight[..., i].unsqueeze(-1).unsqueeze(0)
result += (
self.lora_B[self._active_adapter].loraB[i](self.lora_A[self._active_adapter].loraA[i](self.lora_dropout[self._active_adapter](x)))
* self.scaling[self._active_adapter]
* expert_weight[..., i].view(-1, 1, 1)
)
else:
result = F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
result = self.base_layer(x)

result = result.to(previous_dtype)
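Taken together, the rewritten forward computes the frozen base projection plus a task-gated sum of per-expert LoRA updates. A self-contained sketch of that computation (an illustration of the math, not the repository class; the softmax gate and the shapes are assumptions consistent with expert_weight[..., i].view(-1, 1, 1) above):

```python
import torch
import torch.nn as nn

class MoELoraSketch(nn.Module):
    def __init__(self, base: nn.Linear, r: int = 8, expert_num: int = 2,
                 task_num: int = 2, te_dim: int = 16, scaling: float = 1.0):
        super().__init__()
        self.base, self.scaling = base, scaling
        # One low-rank A/B pair per expert, analogous to loraA[i] / loraB[i] above.
        self.lora_A = nn.ModuleList(nn.Linear(base.in_features, r, bias=False) for _ in range(expert_num))
        self.lora_B = nn.ModuleList(nn.Linear(r, base.out_features, bias=False) for _ in range(expert_num))
        # Task embedding feeding a gate that mixes the experts.
        self.task_embedding = nn.Embedding(task_num, te_dim)
        self.gate = nn.Sequential(nn.Linear(te_dim, expert_num), nn.Softmax(dim=-1))

    def forward(self, x: torch.Tensor, task_id: torch.Tensor) -> torch.Tensor:
        result = self.base(x)                               # frozen pretrained projection
        weights = self.gate(self.task_embedding(task_id))   # (batch, expert_num) mixture weights
        for i, (A, B) in enumerate(zip(self.lora_A, self.lora_B)):
            result = result + B(A(x)) * self.scaling * weights[..., i].view(-1, 1, 1)
        return result

# Example: batch of 2 sequences, all routed as task 0 (matching the hard-coded task_id above).
layer = MoELoraSketch(nn.Linear(64, 64))
out = layer(torch.randn(2, 10, 64), torch.zeros(2, dtype=torch.long))
```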
@@ -26,13 +26,29 @@ from .other import CONFIG_NAME

class PeftType(str, enum.Enum):
PROMPT_TUNING = "PROMPT_TUNING"
MULTITASK_PROMPT_TUNING = "MULTITASK_PROMPT_TUNING"
P_TUNING = "P_TUNING"
PREFIX_TUNING = "PREFIX_TUNING"
LORA = "LORA"
ADALORA = "ADALORA"
BOFT = "BOFT"
ADAPTION_PROMPT = "ADAPTION_PROMPT"
IA3 = "IA3"
LOHA = "LOHA"
LOKR = "LOKR"
OFT = "OFT"
POLY = "POLY"
LN_TUNING = "LN_TUNING"
VERA = "VERA"
FOURIERFT = "FOURIERFT"
XLORA = "XLORA"
HRA = "HRA"
VBLORA = "VBLORA"
CPT = "CPT"
BONE = "BONE"

MMOELORAS = "MMOELORAS"

MMOELORA = "MMOELORA"

class TaskType(str, enum.Enum):
SEQ_CLS = "SEQ_CLS"
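Because PeftType is a str-backed enum, the two new members serialize to plain strings in saved adapter configs, so "MMOELORA" and "MMOELORAS" round-trip through JSON just like the built-in types. A quick illustration (assumes the import path shown in the tuner hunk above):

```python
from peft_library.utils.peft_types import PeftType

assert PeftType.MMOELORA.value == "MMOELORA"
assert PeftType("MMOELORAS") is PeftType.MMOELORAS
assert isinstance(PeftType.MMOELORA, str)  # str subclass, so it serializes as a plain string
```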
src/train.py — 31 lines changed
@@ -1,6 +1,6 @@
import torch
from dataset_library.factory import get_dataset
from transformers import AutoModelForVision2Seq, AutoProcessor, TrainingArguments
from transformers import AutoModelForVision2Seq, AutoProcessor, TrainingArguments, Qwen2VLForConditionalGeneration

from trl import (
ModelConfig,
@@ -9,7 +9,7 @@ from trl import (
# get_peft_config,
get_quantization_config,
)
from peft_library import get_peft_model, get_peft_config
from peft_library import get_peft_model, get_peft_config, inject_adapter_in_model

from utils.trainer import ContinualTrainer
from utils.args import ContinualScriptArguments, ContinualModelConfig
@@ -30,16 +30,6 @@ if __name__ == "__main__":
training_args.dataset_kwargs = {"skip_prepare_dataset": True}

# peft_config = get_peft_config(dict(**vars(model_args)))
if model_args.peft_type == "MMOELora":
from peft_library.tuners import MMOELoraConfig

peft_config = MMOELoraConfig(target_modules=model_args.lora_target_modules)
elif model_args.peft_type == "LORA":
from peft.tuners.lora import LoraConfig

peft_config = LoraConfig(target_modules=model_args.lora_target_modules)
else:
peft_config = None

torch_dtype = (
model_args.torch_dtype
@@ -84,11 +74,26 @@ if __name__ == "__main__":

accelerator = create_accelerator_and_postprocess(training_args)

if peft_config is not None:
if model_args.peft_type == "MMOELORA":
from peft_library.tuners import MMOELoraConfig

peft_config = MMOELoraConfig(target_modules=model_args.lora_target_modules)

# model = get_peft_model(model, peft_config)
model = inject_adapter_in_model(peft_config, model)
print(model)
elif model_args.peft_type == "LORA":
from peft.tuners.lora import LoraConfig

peft_config = LoraConfig(target_modules=model_args.lora_target_modules)

model = get_peft_model(model, peft_config)
if accelerator.is_local_main_process:
model.print_trainable_parameters()

else:
peft_config = None

for dataset_name in script_args.dataset_name:
dataset = get_dataset(dataset_name)
model.train()
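Note that the PEFT dispatch now runs after accelerator creation and compares against "MMOELORA" (upper case), while the removed block compared against "MMOELora"; the launch script below passes --peft_type MMOELORA, so only the new branch matches. A hedged sketch of normalizing the CLI value before dispatching (the helper is hypothetical, not part of the commit):

```python
def resolve_peft_type(raw: str) -> str:
    # Upper-case the CLI value so "MMOELora", "mmoelora" and "MMOELORA" all hit the same branch.
    value = raw.upper()
    if value not in {"MMOELORA", "LORA"}:
        raise ValueError(f"Unsupported peft_type: {raw}")
    return value
```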
@@ -3,7 +3,7 @@
accelerate launch --config_file configs/accelerate_configs/deepspeed_zero2.yaml train.py \
--dataset_name OCR_VQA_200K OCR_VQA_200K OCR_VQA_200K \
--use_peft \
--peft_type LORA \
--peft_type MMOELORA \
--model_name_or_path Qwen/Qwen2-VL-7B-Instruct \
--lora_target_modules q_proj v_proj \
--per_device_train_batch_size 1 \