Add an initialization file for the PEFT library, update dataset import paths, modify the training script to support new PEFT types and configurations, add a continual-learning model config class, add a PEFT type enum, and update the evaluation and training logic to match the new structure

YunyaoZhou 2025-01-02 02:44:58 +08:00
parent aef0f6834e
commit 2cd1bb4993
Signed by: shujakuin
GPG Key ID: 418C3CA28E350CCF
26 changed files with 1673 additions and 20 deletions

@@ -84,7 +84,7 @@ def collate_fn_for_evaluate(examples, processor: Qwen2VLProcessor):
 if __name__ == "__main__":
     from transformers import Qwen2VLProcessor
-    from datasets_library.OCRVQADataset import OCRVQADatasetForGeneration
+    from dataset_library.OCRVQADataset import OCRVQADatasetForGeneration
     processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
     dataset = OCRVQADatasetForGeneration(

@@ -1,5 +1,5 @@
 import torch
-from datasets_library.factory import get_dataset
+from dataset_library.factory import get_dataset
 from transformers import AutoModelForVision2Seq, AutoProcessor, TrainingArguments
 from trl import (

@@ -0,0 +1 @@
from .mapping import get_peft_config, get_peft_model

src/peft_library/mapping.py (new file, 288 lines added)

@@ -0,0 +1,288 @@
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import warnings
from typing import TYPE_CHECKING, Any, Optional
import torch
from peft.tuners.xlora.model import XLoraModel
from peft.config import PeftConfig
from peft.mixed_model import PeftMixedModel
from peft.peft_model import (
PeftModel,
PeftModelForCausalLM,
PeftModelForFeatureExtraction,
PeftModelForQuestionAnswering,
PeftModelForSeq2SeqLM,
PeftModelForSequenceClassification,
PeftModelForTokenClassification,
)
from peft.tuners import (
AdaLoraConfig,
AdaLoraModel,
AdaptionPromptConfig,
BOFTConfig,
BOFTModel,
BoneConfig,
BoneModel,
CPTConfig,
CPTEmbedding,
FourierFTConfig,
FourierFTModel,
HRAConfig,
HRAModel,
IA3Config,
IA3Model,
LNTuningConfig,
LNTuningModel,
LoHaConfig,
LoHaModel,
LoKrConfig,
LoKrModel,
LoraConfig,
LoraModel,
MultitaskPromptTuningConfig,
OFTConfig,
OFTModel,
PolyConfig,
PolyModel,
PrefixTuningConfig,
PromptEncoderConfig,
PromptTuningConfig,
VBLoRAConfig,
VBLoRAModel,
VeraConfig,
VeraModel,
XLoraConfig,
)
from .tuners import MMOELoraConfigS, MMOELoraModelS, MMOELoraModel, MMOELoraConfig
from peft.tuners.tuners_utils import BaseTuner
from peft.utils import _prepare_prompt_learning_config
from peft.utils.constants import PEFT_TYPE_TO_PREFIX_MAPPING
if TYPE_CHECKING:
from transformers import PreTrainedModel
MODEL_TYPE_TO_PEFT_MODEL_MAPPING: dict[str, type[PeftModel]] = {
"SEQ_CLS": PeftModelForSequenceClassification,
"SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM,
"CAUSAL_LM": PeftModelForCausalLM,
"TOKEN_CLS": PeftModelForTokenClassification,
"QUESTION_ANS": PeftModelForQuestionAnswering,
"FEATURE_EXTRACTION": PeftModelForFeatureExtraction,
}
PEFT_TYPE_TO_CONFIG_MAPPING: dict[str, type[PeftConfig]] = {
"ADAPTION_PROMPT": AdaptionPromptConfig,
"PROMPT_TUNING": PromptTuningConfig,
"PREFIX_TUNING": PrefixTuningConfig,
"P_TUNING": PromptEncoderConfig,
"LORA": LoraConfig,
"LOHA": LoHaConfig,
"LORAPLUS": LoraConfig,
"LOKR": LoKrConfig,
"ADALORA": AdaLoraConfig,
"BOFT": BOFTConfig,
"IA3": IA3Config,
"MULTITASK_PROMPT_TUNING": MultitaskPromptTuningConfig,
"OFT": OFTConfig,
"POLY": PolyConfig,
"LN_TUNING": LNTuningConfig,
"VERA": VeraConfig,
"FOURIERFT": FourierFTConfig,
"XLORA": XLoraConfig,
"HRA": HRAConfig,
"VBLORA": VBLoRAConfig,
"CPT": CPTConfig,
"BONE": BoneConfig,
"MMOELORA": MMOELoraConfig,
"MMOELORAS": MMOELoraConfigS,
}
PEFT_TYPE_TO_TUNER_MAPPING: dict[str, type[BaseTuner]] = {
"LORA": LoraModel,
"LOHA": LoHaModel,
"LOKR": LoKrModel,
"ADALORA": AdaLoraModel,
"BOFT": BOFTModel,
"IA3": IA3Model,
"OFT": OFTModel,
"POLY": PolyModel,
"LN_TUNING": LNTuningModel,
"VERA": VeraModel,
"FOURIERFT": FourierFTModel,
"XLORA": XLoraModel,
"HRA": HRAModel,
"VBLORA": VBLoRAModel,
"CPT": CPTEmbedding,
"BONE": BoneModel,
"MMOELORA": MMOELoraModel,
"MMOELORAS": MMOELoraModelS,
}
def get_peft_config(config_dict: dict[str, Any]) -> PeftConfig:
"""
Returns a Peft config object from a dictionary.
Args:
config_dict (`Dict[str, Any]`): Dictionary containing the configuration parameters.
"""
return PEFT_TYPE_TO_CONFIG_MAPPING[config_dict["peft_type"]](**config_dict)
def get_peft_model(
model: PreTrainedModel,
peft_config: PeftConfig,
adapter_name: str = "default",
mixed: bool = False,
autocast_adapter_dtype: bool = True,
revision: Optional[str] = None,
low_cpu_mem_usage: bool = False,
) -> PeftModel | PeftMixedModel:
"""
Returns a Peft model object from a model and a config.
Args:
model ([`transformers.PreTrainedModel`]):
Model to be wrapped.
peft_config ([`PeftConfig`]):
Configuration object containing the parameters of the Peft model.
adapter_name (`str`, `optional`, defaults to `"default"`):
The name of the adapter to be injected, if not provided, the default adapter name is used ("default").
mixed (`bool`, `optional`, defaults to `False`):
Whether to allow mixing different (compatible) adapter types.
autocast_adapter_dtype (`bool`, *optional*):
Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
using float16 or bfloat16 to float32, as this is typically required for stable training, and only affect
select PEFT tuners.
revision (`str`, `optional`, defaults to `main`):
The revision of the base model. If this isn't set, the saved peft model will load the `main` revision for
the base model
low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
Create empty adapter weights on meta device. Useful to speed up the loading process. Leave this setting as
False if you intend on training the model, unless the adapter weights will be replaced by different weights
before training starts.
"""
model_config = BaseTuner.get_model_config(model)
old_name = peft_config.base_model_name_or_path
new_name = model.__dict__.get("name_or_path", None)
peft_config.base_model_name_or_path = new_name
if (old_name is not None) and (old_name != new_name):
warnings.warn(
f"The PEFT config's `base_model_name_or_path` was renamed from '{old_name}' to '{new_name}'. "
"Please ensure that the correct base model is loaded when loading this checkpoint."
)
if revision is not None:
if peft_config.revision is not None and peft_config.revision != revision:
warnings.warn(
f"peft config has already set base model revision to {peft_config.revision}, overwriting with revision {revision}"
)
peft_config.revision = revision
if (
(isinstance(peft_config, PEFT_TYPE_TO_CONFIG_MAPPING["LORA"]))
and (peft_config.init_lora_weights == "eva")
and not low_cpu_mem_usage
):
warnings.warn(
"lora with eva initialization used with low_cpu_mem_usage=False. "
"Setting low_cpu_mem_usage=True can improve the maximum batch size possible for eva initialization."
)
prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(peft_config.peft_type)
if prefix and adapter_name in prefix:
warnings.warn(
f"Adapter name {adapter_name} should not be contained in the prefix {prefix}."
"This may lead to reinitialization of the adapter weights during loading."
)
if mixed:
# note: PeftMixedModel does not support autocast_adapter_dtype, so don't pass it
return PeftMixedModel(model, peft_config, adapter_name=adapter_name)
if (
peft_config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys()
and not peft_config.is_prompt_learning
):
return PeftModel(
model,
peft_config,
adapter_name=adapter_name,
autocast_adapter_dtype=autocast_adapter_dtype,
low_cpu_mem_usage=low_cpu_mem_usage,
)
if peft_config.is_prompt_learning:
peft_config = _prepare_prompt_learning_config(peft_config, model_config)
return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](
model,
peft_config,
adapter_name=adapter_name,
autocast_adapter_dtype=autocast_adapter_dtype,
low_cpu_mem_usage=low_cpu_mem_usage,
)
def inject_adapter_in_model(
peft_config: PeftConfig,
model: torch.nn.Module,
adapter_name: str = "default",
low_cpu_mem_usage: bool = False,
) -> torch.nn.Module:
r"""
A simple API to create and inject adapter in-place into a model. Currently the API does not support prompt learning
methods and adaption prompt. Make sure to have the correct `target_names` set in the `peft_config` object. The API
calls `get_peft_model` under the hood but would be restricted only to non-prompt learning methods.
Args:
peft_config (`PeftConfig`):
Configuration object containing the parameters of the Peft model.
model (`torch.nn.Module`):
The input model where the adapter will be injected.
adapter_name (`str`, `optional`, defaults to `"default"`):
The name of the adapter to be injected, if not provided, the default adapter name is used ("default").
low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
Create empty adapter weights on meta device. Useful to speed up the loading process.
"""
if peft_config.is_prompt_learning or peft_config.is_adaption_prompt:
raise ValueError(
"`create_and_replace` does not support prompt learning and adaption prompt yet."
)
if peft_config.peft_type not in PEFT_TYPE_TO_TUNER_MAPPING.keys():
raise ValueError(
f"`inject_adapter_in_model` does not support {peft_config.peft_type} yet. Please use `get_peft_model`."
)
tuner_cls = PEFT_TYPE_TO_TUNER_MAPPING[peft_config.peft_type]
# By instantiating a peft model we are injecting randomly initialized LoRA layers into the model's modules.
peft_model = tuner_cls(
model,
peft_config,
adapter_name=adapter_name,
low_cpu_mem_usage=low_cpu_mem_usage,
)
return peft_model.model
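
For orientation, here is a minimal usage sketch of the two functions this file exposes, assuming the peft_library package above is importable; the tiny base model name and the LoRA hyperparameters are placeholders, not part of the commit:

from transformers import AutoModelForCausalLM
from peft_library import get_peft_config, get_peft_model

# The "peft_type" key selects the entry in PEFT_TYPE_TO_CONFIG_MAPPING above;
# "MMOELORA" or "MMOELORAS" would be resolved the same way.
peft_config = get_peft_config(
    {
        "peft_type": "LORA",
        "task_type": "CAUSAL_LM",
        "r": 8,
        "lora_alpha": 16,
        "target_modules": ["c_attn"],
    }
)
base_model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")  # placeholder model
# task_type CAUSAL_LM routes through MODEL_TYPE_TO_PEFT_MODEL_MAPPING to PeftModelForCausalLM.
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()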

@@ -0,0 +1,2 @@
from .mmoelora.mmoelora import MMOELoraModel, MMOELoraConfig
from .mmoelora.mmoeloraS import MMOELoraModelS, MMOELoraConfigS

@@ -0,0 +1,467 @@
# -*- encoding: utf-8 -*-
# here put the import lib
import importlib
import re
import warnings
from dataclasses import dataclass, field
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers.pytorch_utils import Conv1D
from peft_library.utils.peft_types import PeftType
from peft_library.utils.constants import TRANSFORMERS_MODELS_TO_MMOELORA_TARGET_MODULES_MAPPING
from peft.utils.other import _freeze_adapter, _get_submodules, transpose
from peft.tuners.lora import (
LoraConfig,
LoraLayer,
LoraModel,
)
def is_bnb_available():
return importlib.util.find_spec("bitsandbytes") is not None
@dataclass
class MMOELoraConfig(LoraConfig):
"""
This is the configuration class to store the configuration of a [`~peft.MMOELora`]
"""
task_num: int = field(default=2, metadata={"help": "The number of tasks."})
task_embedding_dim: int = field(default=64)
expert_num: int = field(default=4)
def __post_init__(self):
self.peft_type = PeftType.MMOELORA
class MMOELoraModel(LoraModel):
"""
Create MMOELoRA (MMOE based LoRA) model from a pretrained transformers model.
"""
def __init__(self, model, config, adapter_name):
nn.Module.__init__(self)
self.model = model
self.forward = self.model.forward
self.peft_config = config
self.add_adapter(adapter_name, self.peft_config[adapter_name])
def add_adapter(self, adapter_name, config=None):
if config is not None: # get the lora config
model_config = (
self.model.config.to_dict()
if hasattr(self.model.config, "to_dict")
else self.model.config
)
config = self._prepare_mmoelora_config(config, model_config) # load config
            self.peft_config[adapter_name] = config  # substitute the original config
self._find_and_replace(adapter_name)
if len(self.peft_config) > 1 and self.peft_config[adapter_name].bias != "none":
raise ValueError(
"MMOELoraModel supports only 1 adapter with bias. When using multiple adapters, set bias to 'none' for all adapters."
)
self._mark_only_adapters_as_trainable(self.model)
if self.peft_config[adapter_name].inference_mode:
_freeze_adapter(self.model, adapter_name)
def _find_and_replace(self, adapter_name):
"""Replace the target `Linear` module with LoRA layer (Linear+LoRA)"""
lora_config = self.peft_config[adapter_name]
loaded_in_8bit = getattr(self.model, "is_loaded_in_8bit", False)
if loaded_in_8bit and not is_bnb_available():
raise ImportError(
"To use Lora with 8-bit quantization, please install the `bitsandbytes` package. "
"You can install it with `pip install bitsandbytes`."
)
is_target_modules_in_base_model = False
kwargs = {
"r": lora_config.r,
"lora_alpha": lora_config.lora_alpha,
"lora_dropout": lora_config.lora_dropout,
"fan_in_fan_out": lora_config.fan_in_fan_out,
"init_lora_weights": lora_config.init_lora_weights,
"task_num": lora_config.task_num,
"task_embedding_dim": lora_config.task_embedding_dim,
"expert_num": lora_config.expert_num,
}
key_list = [
key for key, _ in self.model.named_modules()
        ]  # all modules in the raw model
for key in key_list:
# find the corresponding modules. target module has been split into list.
if isinstance(lora_config.target_modules, str):
target_module_found = re.fullmatch(lora_config.target_modules, key)
else:
target_module_found = any(
key.endswith(target_key)
for target_key in lora_config.target_modules
)
if target_module_found:
if not is_target_modules_in_base_model:
is_target_modules_in_base_model = True
parent, target, target_name = _get_submodules(self.model, key)
bias = target.bias is not None
if isinstance(target, MMOELoraLayer):
target.update_layer(
adapter_name,
lora_config.init_r,
lora_config.lora_alpha,
lora_config.lora_dropout,
lora_config.init_lora_weights,
)
else:
if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt):
raise NotImplementedError
else:
if isinstance(target, torch.nn.Linear):
in_features, out_features = (
target.in_features,
target.out_features,
)
if kwargs["fan_in_fan_out"]:
warnings.warn(
"fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
"Setting fan_in_fan_out to False."
)
kwargs["fan_in_fan_out"] = (
lora_config.fan_in_fan_out
) = False
elif isinstance(target, Conv1D):
in_features, out_features = (
target.weight.ds_shape
if hasattr(target.weight, "ds_shape")
else target.weight.shape
)
if not kwargs["fan_in_fan_out"]:
warnings.warn(
"fan_in_fan_out is set to False but the target module is `Conv1D`. "
"Setting fan_in_fan_out to True."
)
kwargs["fan_in_fan_out"] = (
lora_config.fan_in_fan_out
) = True
else:
raise ValueError(
f"Target module {target} is not supported. "
f"Currently, only `torch.nn.Linear` and `Conv1D` are supported."
)
new_module = MMOELoraLinear(
adapter_name, in_features, out_features, bias=bias, **kwargs
)
self._replace_module(parent, target_name, new_module, target)
if not is_target_modules_in_base_model:
raise ValueError(
f"Target modules {lora_config.target_modules} not found in the base model. "
f"Please check the target modules and try again."
)
def __getattr__(self, name: str):
"""Forward missing attributes to the wrapped module."""
try:
return super().__getattr__(name) # defer to nn.Module's logic
except AttributeError:
return getattr(self.model, name)
@staticmethod
def _prepare_mmoelora_config(peft_config, model_config):
if peft_config.target_modules is None:
if (
model_config["model_type"]
not in TRANSFORMERS_MODELS_TO_MMOELORA_TARGET_MODULES_MAPPING
):
raise ValueError("Please specify `target_modules` in `peft_config`")
peft_config.target_modules = (
TRANSFORMERS_MODELS_TO_MMOELORA_TARGET_MODULES_MAPPING[
model_config["model_type"]
]
)
if peft_config.inference_mode:
peft_config.merge_weights = True
return peft_config
class MMOELoraLayer(LoraLayer):
def __init__(self, in_features: int, out_features: int, expert_num: int):
super().__init__(in_features, out_features)
self.expert_num = expert_num
def update_layer(
self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights
):
self.r[adapter_name] = r
self.lora_alpha[adapter_name] = lora_alpha
if lora_dropout > 0.0:
lora_dropout_layer = nn.Dropout(p=lora_dropout)
else:
lora_dropout_layer = nn.Identity()
self.lora_dropout.update(nn.ModuleDict({adapter_name: lora_dropout_layer}))
# Actual trainable parameters
if r > 0:
self.lora_A.update(
nn.ModuleDict(
{adapter_name: MMOELinearA(self.in_features, r, self.expert_num)}
)
)
self.lora_B.update(
nn.ModuleDict(
{adapter_name: MMOELinearB(r, self.out_features, self.expert_num)}
)
)
self.scaling[adapter_name] = lora_alpha / r
if init_lora_weights:
self.reset_lora_parameters(adapter_name)
self.to(self.weight.device)
def reset_lora_parameters(self, adapter_name):
if adapter_name in self.lora_A.keys():
# initialize A the same way as the default for nn.Linear and B to zero
for i in range(self.expert_num):
nn.init.normal_(
self.lora_A[adapter_name].loraA[i].mlp.weight, mean=0.0, std=0.01
)
nn.init.zeros_(self.lora_B[adapter_name].loraB[i].mlp.weight)
class MMOELoraLinear(nn.Linear, MMOELoraLayer):
# Lora implemented in a dense layer
# nn.Linear is the pretrained weights in LLM, MMOELoraLayer is the designed trainable Lora
def __init__(
self,
adapter_name: str,
in_features: int,
out_features: int,
r: int = 0,
lora_alpha: int = 1,
lora_dropout: float = 0.0,
fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
**kwargs,
):
init_lora_weights = kwargs.pop("init_lora_weights", True)
self.expert_num = kwargs.pop("expert_num", True)
self.task_num = kwargs.pop("task_num", True)
self.te_dim = kwargs.pop("task_embedding_dim", True)
nn.Linear.__init__(self, in_features, out_features, **kwargs)
MMOELoraLayer.__init__(
self,
in_features=in_features,
out_features=out_features,
expert_num=self.expert_num,
)
# init the Gate network
self.lora_task_embedding = nn.ModuleDict({})
self.lora_gate = nn.ModuleDict({})
self.lora_task_embedding.update(
nn.ModuleDict({adapter_name: nn.Embedding(self.task_num + 1, self.te_dim)})
)
self.lora_gate.update(
nn.ModuleDict({adapter_name: Gate(self.te_dim, self.expert_num)})
)
# Freezing the pre-trained weight matrix
self.weight.requires_grad = False
self.fan_in_fan_out = fan_in_fan_out
if fan_in_fan_out:
self.weight.data = self.weight.data.T
nn.Linear.reset_parameters(self)
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
self.active_adapter = adapter_name
def merge(self, task_id):
if self.active_adapter not in self.lora_A.keys():
return
if self.merged:
warnings.warn("Already merged. Nothing to do.")
return
if self.r[self.active_adapter] > 0:
expert_weight = self.lora_gate[self.active_adapter](
self.lora_task_embedding[self.active_adapter](task_id)
)
for i in range(self.expert_num):
lora_A_weights = self.lora_A[self.active_adapter].loraA[i].mlp.weight
lora_B_weights = self.lora_B[self.active_adapter].loraB[i].mlp.weight
self.weight.data += (
transpose(
lora_B_weights @ lora_A_weights,
self.fan_in_fan_out,
)
* self.scaling[self.active_adapter]
* expert_weight[..., i]
)
self.merged = True
def unmerge(self, task_id):
if self.active_adapter not in self.lora_A.keys():
return
if not self.merged:
warnings.warn("Already unmerged. Nothing to do.")
return
if self.r[self.active_adapter] > 0:
expert_weight = self.lora_gate[self.active_adapter](
self.lora_task_embedding[self.active_adapter](task_id)
)
for i in range(self.expert_num):
lora_A_weights = self.lora_A[self.active_adapter].loraA[i].mlp.weight
lora_B_weights = self.lora_B[self.active_adapter].loraB[i].mlp.weight
self.weight.data -= (
transpose(
lora_B_weights @ lora_A_weights,
self.fan_in_fan_out,
)
* self.scaling[self.active_adapter]
* expert_weight[..., i]
)
self.merged = False
def forward(self, x: torch.Tensor, **kwargs):
task_id = kwargs["task_id"]
previous_dtype = x.dtype
if (
self.active_adapter not in self.lora_A.keys()
): # No adapter, directly use linear
return F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
if self.disable_adapters: # No adapter
if (
self.r[self.active_adapter] > 0 and self.merged
): # merge the adapter to linear
self.unmerge(task_id)
result = F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
elif (
self.r[self.active_adapter] > 0 and not self.merged
): # general lora process
result = F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
x = x.to(self.lora_A[self.active_adapter].loraA[0].weight.dtype)
expert_weight = self.lora_gate[self.active_adapter](
self.lora_task_embedding[self.active_adapter](task_id)
)
for i in range(self.expert_num):
result += ( # lora process
self.lora_B[self.active_adapter].loraB[i](
self.lora_A[self.active_adapter].loraA[i](
self.lora_dropout[self.active_adapter](x)
),
)
* self.scaling[self.active_adapter]
* expert_weight[..., i].unsqueeze(-1).unsqueeze(0)
)
else:
result = F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
result = result.to(previous_dtype)
return result
class MMOELinearA(nn.Module):
"""MMOE based LoRA block"""
def __init__(self, in_features, out_features, expert_num) -> None:
super().__init__()
self.expert_num = expert_num
self.in_features, self.out_features = in_features, out_features
self.loraA = nn.ModuleList([])
assert (
self.out_features % self.expert_num == 0
) # lora rank should be divided by expert number
self.r = self.out_features // self.expert_num
for _ in range(self.expert_num):
self.loraA.append(Expert(self.in_features, self.r))
def forward(self, x):
"""input x is a vector, return output is a list"""
outputs = []
for i in range(self.expert_num):
outputs.append(self.loraA[i](x))
return outputs
class MMOELinearB(nn.Module):
"""MMOE based LoRA block"""
def __init__(self, in_features, out_features, expert_num) -> None:
super().__init__()
self.expert_num = expert_num
self.in_features, self.out_features = in_features, out_features
self.loraB = nn.ModuleList([])
assert self.in_features % self.expert_num == 0
self.r = self.in_features // self.expert_num
for _ in range(self.expert_num):
self.loraB.append(Expert(self.r, self.out_features))
def forward(self, x):
"""input x is a list, return output is also a list"""
outputs = []
for i in range(self.expert_num):
outputs.append(self.loraB[i](x[i]))
return outputs
class Expert(nn.Module):
def __init__(self, in_features, out_features):
super().__init__()
self.in_features, self.out_features = in_features, out_features
self.mlp = nn.Linear(self.in_features, self.out_features, bias=False)
self.weight = self.mlp.weight
def forward(self, x):
# LoRA A or B block
y = self.mlp(x)
return y
class Gate(nn.Module):
def __init__(self, input_size, expert_num):
super().__init__()
        # use an embedding instead of a linear layer
        self.GateL = nn.Linear(input_size, expert_num, bias=False)
        self.act = nn.Softmax(dim=1)  # dim 0 is the batch size
def forward(self, x):
y = self.GateL(x)
y = self.act(y)
return y
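
A self-contained sketch of how the expert and gate modules above combine into the weighted low-rank update used in MMOELoraLinear.forward; the import path peft_library.tuners.mmoelora.mmoelora is assumed from the package layout, and all sizes are illustrative:

import torch
from peft_library.tuners.mmoelora.mmoelora import MMOELinearA, MMOELinearB, Gate  # assumed path

in_features, out_features, r, expert_num, te_dim = 32, 48, 8, 4, 16
lora_A = MMOELinearA(in_features, r, expert_num)   # 4 experts, each mapping 32 -> 8 // 4 = 2
lora_B = MMOELinearB(r, out_features, expert_num)  # 4 experts, each mapping 2 -> 48
gate = Gate(te_dim, expert_num)                    # task embedding -> softmax weights over experts

x = torch.randn(3, in_features)                    # a batch of 3 inputs
task_embedding = torch.randn(1, te_dim)            # MMOELoraLinear gets this from nn.Embedding(task_id)
expert_weight = gate(task_embedding)               # shape (1, expert_num), rows sum to 1

expert_outputs = lora_B(lora_A(x))                 # list of expert_num tensors, each (3, 48)
delta = sum(
    expert_outputs[i] * expert_weight[..., i].unsqueeze(-1)
    for i in range(expert_num)
)
print(delta.shape)  # torch.Size([3, 48]); this is what gets scaled and added to the frozen linear output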

@@ -0,0 +1,227 @@
# -*- encoding: utf-8 -*-
# here put the import lib
import re
import importlib
import warnings
from dataclasses import dataclass, field
from .mmoelora import MMOELoraModel, MMOELoraLinear, MMOELoraLayer
from peft.tuners.lora import LoraConfig
import torch
import torch.nn.functional as F
from transformers.pytorch_utils import Conv1D
# from ..utils import _get_submodules, transpose, PeftType
from peft.utils.other import _get_submodules, transpose
from peft.utils.peft_types import PeftType
def is_bnb_available():
return importlib.util.find_spec("bitsandbytes") is not None
# TRANSFORMERS_MODELS_TO_MMOELORA_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
@dataclass
class MMOELoraConfigS(LoraConfig):
"""
This is the configuration class to store the configuration of a [`~peft.MMOELora`]
"""
task_num: int = field(default=2, metadata={"help": "The number of tasks."})
task_embedding_dim: int = field(default=64)
expert_num: int = field(default=4)
def __post_init__(self):
self.peft_type = PeftType.MMOELORAS
class MMOELoraModelS(MMOELoraModel):
def __init__(self, model, config, adapter_name):
super().__init__(model, config, adapter_name)
def _find_and_replace(self, adapter_name):
"""Replace the target `Linear` module with LoRA layer (Linear+LoRA)"""
lora_config = self.peft_config[adapter_name]
loaded_in_8bit = getattr(self.model, "is_loaded_in_8bit", False)
if loaded_in_8bit and not is_bnb_available():
raise ImportError(
"To use Lora with 8-bit quantization, please install the `bitsandbytes` package. "
"You can install it with `pip install bitsandbytes`."
)
is_target_modules_in_base_model = False
kwargs = {
"r": lora_config.r,
"lora_alpha": lora_config.lora_alpha,
"lora_dropout": lora_config.lora_dropout,
"fan_in_fan_out": lora_config.fan_in_fan_out,
"init_lora_weights": lora_config.init_lora_weights,
"task_num": lora_config.task_num,
"task_embedding_dim": lora_config.task_embedding_dim,
"expert_num": lora_config.expert_num,
}
key_list = [
key for key, _ in self.model.named_modules()
        ]  # all modules in the raw model
for key in key_list:
# find the corresponding modules. target module has been split into list.
if isinstance(lora_config.target_modules, str):
target_module_found = re.fullmatch(lora_config.target_modules, key)
else:
target_module_found = any(
key.endswith(target_key)
for target_key in lora_config.target_modules
)
if target_module_found:
if not is_target_modules_in_base_model:
is_target_modules_in_base_model = True
parent, target, target_name = _get_submodules(self.model, key)
bias = target.bias is not None
if isinstance(target, MMOELoraLayer):
target.update_layer(
adapter_name,
lora_config.init_r,
lora_config.lora_alpha,
lora_config.lora_dropout,
lora_config.init_lora_weights,
)
else:
if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt):
raise NotImplementedError
else:
if isinstance(target, torch.nn.Linear):
in_features, out_features = (
target.in_features,
target.out_features,
)
if kwargs["fan_in_fan_out"]:
warnings.warn(
"fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
"Setting fan_in_fan_out to False."
)
kwargs["fan_in_fan_out"] = (
lora_config.fan_in_fan_out
) = False
elif isinstance(target, Conv1D):
in_features, out_features = (
target.weight.ds_shape
if hasattr(target.weight, "ds_shape")
else target.weight.shape
)
if not kwargs["fan_in_fan_out"]:
warnings.warn(
"fan_in_fan_out is set to False but the target module is `Conv1D`. "
"Setting fan_in_fan_out to True."
)
kwargs["fan_in_fan_out"] = (
lora_config.fan_in_fan_out
) = True
else:
raise ValueError(
f"Target module {target} is not supported. "
f"Currently, only `torch.nn.Linear` and `Conv1D` are supported."
)
new_module = MMOELoraLinearS(
adapter_name, in_features, out_features, bias=bias, **kwargs
)
self._replace_module(parent, target_name, new_module, target)
if not is_target_modules_in_base_model:
raise ValueError(
f"Target modules {lora_config.target_modules} not found in the base model. "
f"Please check the target modules and try again."
)
class MMOELoraLinearS(MMOELoraLinear):
def __init__(
self,
adapter_name: str,
in_features: int,
out_features: int,
r: int = 0,
lora_alpha: int = 1,
lora_dropout: float = 0,
fan_in_fan_out: bool = False,
**kwargs,
):
super().__init__(
adapter_name,
in_features,
out_features,
r,
lora_alpha,
lora_dropout,
fan_in_fan_out,
**kwargs,
)
def unmerge(self, expert_weight):
if self.active_adapter not in self.lora_A.keys():
return
if not self.merged:
warnings.warn("Already unmerged. Nothing to do.")
return
if self.r[self.active_adapter] > 0:
for i in range(self.expert_num):
lora_A_weights = self.lora_A[self.active_adapter].loraA[i].mlp.weight
lora_B_weights = self.lora_B[self.active_adapter].loraB[i].mlp.weight
self.weight.data -= (
transpose(
lora_B_weights @ lora_A_weights,
self.fan_in_fan_out,
)
* self.scaling[self.active_adapter]
* expert_weight[..., i]
)
self.merged = False
def forward(self, x: torch.Tensor, **kwargs):
expert_weight = kwargs["task_id"]
previous_dtype = x.dtype
if (
self.active_adapter not in self.lora_A.keys()
): # No adapter, directly use linear
return F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
if self.disable_adapters: # No adapter
if (
self.r[self.active_adapter] > 0 and self.merged
): # merge the adapter to linear
self.unmerge(expert_weight)
result = F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
elif (
self.r[self.active_adapter] > 0 and not self.merged
): # general lora process
result = F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
x = x.to(self.lora_A[self.active_adapter].loraA[0].weight.dtype)
for i in range(self.expert_num):
result += ( # lora process
self.lora_B[self.active_adapter].loraB[i](
self.lora_A[self.active_adapter].loraA[i](
self.lora_dropout[self.active_adapter](x)
),
)
* self.scaling[self.active_adapter]
* expert_weight[..., i].unsqueeze(-1).unsqueeze(0)
)
else:
result = F.linear(
x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias
)
result = result.to(previous_dtype)
return result
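
The main behavioural difference from mmoelora.py is the forward contract: MMOELoraLinearS reads kwargs["task_id"] as an already-computed expert weight tensor (expert_weight = kwargs["task_id"] above) rather than an integer task id that is first embedded and gated. A hedged, pure-PyTorch sketch of how a caller might precompute those weights; the layer construction is left commented because it normally happens inside MMOELoraModelS._find_and_replace:

import torch
import torch.nn as nn

expert_num, te_dim, task_num = 4, 16, 2
task_embedding = nn.Embedding(task_num + 1, te_dim)
gate = nn.Sequential(nn.Linear(te_dim, expert_num, bias=False), nn.Softmax(dim=1))

task_id = torch.tensor([1])
expert_weight = gate(task_embedding(task_id))  # shape (1, expert_num), rows sum to 1
print(expert_weight.shape, expert_weight.sum(dim=1))

# layer_s = MMOELoraLinearS("default", 32, 48, r=8, ...)  # normally built by _find_and_replace
# y = layer_s(x, task_id=expert_weight)                   # "task_id" here carries the gate output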

@@ -0,0 +1,39 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .config import PeftConfig, PeftType, PromptLearningConfig, TaskType
from .other import (
TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING,
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING,
TRANSFORMERS_MODELS_TO_MMOELORAS_TARGET_MODULES_MAPPING,
TRANSFORMERS_MODELS_TO_MMOELORA_TARGET_MODULES_MAPPING,
CONFIG_NAME,
WEIGHTS_NAME,
_set_trainable,
bloom_model_postprocess_past_key_value,
prepare_model_for_int8_training,
shift_tokens_right,
transpose,
_get_submodules,
_set_adapter,
_freeze_adapter,
ModulesToSaveWrapper,
)
from .save_and_load import get_peft_model_state_dict, set_peft_model_state_dict

@@ -0,0 +1,176 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import enum
import json
import os
from dataclasses import asdict, dataclass, field
from typing import Optional, Union
from huggingface_hub import hf_hub_download
from transformers.utils import PushToHubMixin
from .other import CONFIG_NAME
class PeftType(str, enum.Enum):
PROMPT_TUNING = "PROMPT_TUNING"
P_TUNING = "P_TUNING"
PREFIX_TUNING = "PREFIX_TUNING"
LORA = "LORA"
ADALORA = "ADALORA"
ADAPTION_PROMPT = "ADAPTION_PROMPT"
MMOELORAS = "MMOELORAS"
class TaskType(str, enum.Enum):
SEQ_CLS = "SEQ_CLS"
SEQ_2_SEQ_LM = "SEQ_2_SEQ_LM"
CAUSAL_LM = "CAUSAL_LM"
TOKEN_CLS = "TOKEN_CLS"
CAUSAL_LMS = "CAUSAL_LMS"
@dataclass
class PeftConfigMixin(PushToHubMixin):
r"""
This is the base configuration class for PEFT adapter models. It contains all the methods that are common to all
PEFT adapter models. This class inherits from [`~transformers.utils.PushToHubMixin`] which contains the methods to
push your model to the Hub. The method `save_pretrained` will save the configuration of your adapter model in a
directory. The method `from_pretrained` will load the configuration of your adapter model from a directory.
Args:
peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use.
"""
peft_type: Optional[PeftType] = field(default=None, metadata={"help": "The type of PEFT model."})
@property
def __dict__(self):
return asdict(self)
def to_dict(self):
return self.__dict__
def save_pretrained(self, save_directory, **kwargs):
r"""
This method saves the configuration of your adapter model in a directory.
Args:
save_directory (`str`):
The directory where the configuration will be saved.
kwargs (additional keyword arguments, *optional*):
Additional keyword arguments passed along to the [`~transformers.utils.PushToHubMixin.push_to_hub`]
method.
"""
if os.path.isfile(save_directory):
raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
os.makedirs(save_directory, exist_ok=True)
output_dict = self.__dict__
output_path = os.path.join(save_directory, CONFIG_NAME)
# save it
with open(output_path, "w") as writer:
writer.write(json.dumps(output_dict, indent=2, sort_keys=True))
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, subfolder=None, **kwargs):
r"""
This method loads the configuration of your adapter model from a directory.
Args:
pretrained_model_name_or_path (`str`):
The directory or the Hub repository id where the configuration is saved.
kwargs (additional keyword arguments, *optional*):
Additional keyword arguments passed along to the child class initialization.
"""
path = (
os.path.join(pretrained_model_name_or_path, subfolder)
if subfolder is not None
else pretrained_model_name_or_path
)
if os.path.isfile(os.path.join(path, CONFIG_NAME)):
config_file = os.path.join(path, CONFIG_NAME)
else:
try:
config_file = hf_hub_download(pretrained_model_name_or_path, CONFIG_NAME, subfolder=subfolder)
except Exception:
raise ValueError(f"Can't find '{CONFIG_NAME}' at '{pretrained_model_name_or_path}'")
loaded_attributes = cls.from_json_file(config_file)
config = cls(**kwargs)
for key, value in loaded_attributes.items():
if hasattr(config, key):
setattr(config, key, value)
return config
@classmethod
def from_json_file(cls, path_json_file, **kwargs):
r"""
Loads a configuration file from a json file.
Args:
path_json_file (`str`):
The path to the json file.
"""
with open(path_json_file, "r") as file:
json_object = json.load(file)
return json_object
@dataclass
class PeftConfig(PeftConfigMixin):
"""
This is the base configuration class to store the configuration of a [`PeftModel`].
Args:
peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use.
task_type (Union[[`~peft.utils.config.TaskType`], `str`]): The type of task to perform.
inference_mode (`bool`, defaults to `False`): Whether to use the Peft model in inference mode.
"""
base_model_name_or_path: str = field(default=None, metadata={"help": "The name of the base model to use."})
peft_type: Union[str, PeftType] = field(default=None, metadata={"help": "Peft type"})
task_type: Union[str, TaskType] = field(default=None, metadata={"help": "Task type"})
inference_mode: bool = field(default=False, metadata={"help": "Whether to use inference mode"})
@dataclass
class PromptLearningConfig(PeftConfig):
"""
This is the base configuration class to store the configuration of [`PrefixTuning`], [`PromptEncoder`], or
[`PromptTuning`].
Args:
num_virtual_tokens (`int`): The number of virtual tokens to use.
token_dim (`int`): The hidden embedding dimension of the base transformer model.
num_transformer_submodules (`int`): The number of transformer submodules in the base transformer model.
num_attention_heads (`int`): The number of attention heads in the base transformer model.
num_layers (`int`): The number of layers in the base transformer model.
"""
num_virtual_tokens: int = field(default=None, metadata={"help": "Number of virtual tokens"})
token_dim: int = field(
default=None, metadata={"help": "The hidden embedding dimension of the base transformer model"}
)
num_transformer_submodules: Optional[int] = field(
default=None, metadata={"help": "Number of transformer submodules"}
)
num_attention_heads: Optional[int] = field(default=None, metadata={"help": "Number of attention heads"})
num_layers: Optional[int] = field(default=None, metadata={"help": "Number of transformer layers"})
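
A small round-trip sketch of the save/load helpers defined above, assuming these files live under peft_library/utils/; the directory name and model id are placeholders:

from peft_library.utils.config import PeftConfig, PeftType, TaskType  # assumed path

config = PeftConfig(
    base_model_name_or_path="Qwen/Qwen2-VL-7B-Instruct",
    peft_type=PeftType.MMOELORAS,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)
config.save_pretrained("./adapter_dir")            # writes adapter_config.json (CONFIG_NAME)
reloaded = PeftConfig.from_pretrained("./adapter_dir")
print(reloaded.peft_type, reloaded.task_type)      # values survive the JSON round trip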

@@ -0,0 +1,4 @@
from peft.utils.constants import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
TRANSFORMERS_MODELS_TO_MMOELORAS_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
TRANSFORMERS_MODELS_TO_MMOELORA_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING

@@ -0,0 +1,250 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import torch
# needed for prefix-tuning of bloom model
def bloom_model_postprocess_past_key_value(past_key_values):
past_key_values = torch.cat(past_key_values)
total_layers, batch_size, num_attention_heads, num_virtual_tokens, head_dim = past_key_values.shape
keys = past_key_values[: total_layers // 2]
keys = keys.transpose(2, 3).reshape(
total_layers // 2, batch_size * num_attention_heads, head_dim, num_virtual_tokens
)
values = past_key_values[total_layers // 2 :]
values = values.reshape(total_layers // 2, batch_size * num_attention_heads, num_virtual_tokens, head_dim)
return tuple(zip(keys, values))
def prepare_model_for_int8_training(
model, output_embedding_layer_name="lm_head", use_gradient_checkpointing=True, layer_norm_names=["layer_norm"]
):
r"""
This method wraps the entire protocol for preparing a model before running a training. This includes:
        1- Cast the layernorm in fp32
        2- Make the output embedding layer require grads
        3- Upcast the lm head to fp32
    Args:
        model (`transformers.PreTrainedModel`):
The loaded model from `transformers`
"""
loaded_in_8bit = getattr(model, "is_loaded_in_8bit", False)
for name, param in model.named_parameters():
# freeze base model's layers
param.requires_grad = False
if loaded_in_8bit:
# cast layer norm in fp32 for stability for 8bit models
if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names):
param.data = param.data.to(torch.float32)
if loaded_in_8bit and use_gradient_checkpointing:
# For backward compatibility
if hasattr(model, "enable_input_require_grads"):
model.enable_input_require_grads()
else:
def make_inputs_require_grad(module, input, output):
output.requires_grad_(True)
model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
# enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()
if hasattr(model, output_embedding_layer_name):
output_embedding_layer = getattr(model, output_embedding_layer_name)
input_dtype = output_embedding_layer.weight.dtype
class CastOutputToFloat(torch.nn.Sequential):
r"""
Manually cast to the expected dtype of the lm_head as sometimes there is a final layer norm that is casted
in fp32
"""
def forward(self, x):
return super().forward(x.to(input_dtype)).to(torch.float32)
setattr(model, output_embedding_layer_name, CastOutputToFloat(output_embedding_layer))
return model
# copied from transformers.models.bart.modeling_bart
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input ids
pad_token_id (`int`): The id of the `padding` token.
decoder_start_token_id (`int`): The id of the `start` token.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
# replace possible -100 values in labels by `pad_token_id`
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
class ModulesToSaveWrapper(torch.nn.Module):
def __init__(self, module_to_save, adapter_name):
super().__init__()
self.original_module = module_to_save
self.modules_to_save = torch.nn.ModuleDict({})
self.update(adapter_name)
self.active_adapter = adapter_name
def update(self, adapter_name):
self.modules_to_save.update(torch.nn.ModuleDict({adapter_name: copy.deepcopy(self.original_module)}))
def forward(self, *args, **kwargs):
if self.active_adapter not in self.modules_to_save:
return self.original_module(*args, **kwargs)
return self.modules_to_save[self.active_adapter](*args, **kwargs)
def _get_submodules(model, key):
parent = model.get_submodule(".".join(key.split(".")[:-1]))
target_name = key.split(".")[-1]
target = model.get_submodule(key)
return parent, target, target_name
def _freeze_adapter(model, adapter_name):
for n, p in model.named_parameters():
if adapter_name in n:
p.requires_grad = False
def _set_trainable(model, adapter_name):
key_list = [key for key, _ in model.named_modules()]
for key in key_list:
target_module_found = any(key.endswith(target_key) for target_key in model.modules_to_save)
if target_module_found:
parent, target, target_name = _get_submodules(model, key)
if isinstance(target, ModulesToSaveWrapper):
target.update(adapter_name)
else:
for param in target.parameters():
param.requires_grad = True
setattr(parent, target_name, ModulesToSaveWrapper(target, adapter_name))
def _set_adapter(model, adapter_name):
for module in model.modules():
if isinstance(module, ModulesToSaveWrapper):
module.active_adapter = adapter_name
def fsdp_auto_wrap_policy(model):
import functools
import os
from accelerate import FullyShardedDataParallelPlugin
from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy
from ..tuners import PrefixEncoder, PromptEmbedding, PromptEncoder
def lambda_policy_fn(module):
if (
len(list(module.named_children())) == 0
and getattr(module, "weight", None) is not None
and module.weight.requires_grad
):
return True
return False
lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn)
transformer_wrap_policy = functools.partial(
transformer_auto_wrap_policy,
transformer_layer_cls=(
PrefixEncoder,
PromptEncoder,
PromptEmbedding,
FullyShardedDataParallelPlugin.get_module_class_from_name(
model, os.environ.get("FSDP_TRANSFORMER_CLS_TO_WRAP", "")
),
),
)
auto_wrap_policy = functools.partial(_or_policy, policies=[lambda_policy, transformer_wrap_policy])
return auto_wrap_policy
def transpose(weight, fan_in_fan_out):
return weight.T if fan_in_fan_out else weight
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = {
"t5": ["q", "v"],
"mt5": ["q", "v"],
"bart": ["q_proj", "v_proj"],
"gpt2": ["c_attn"],
"bloom": ["query_key_value"],
"blip-2": ["q", "v", "q_proj", "v_proj"],
"opt": ["q_proj", "v_proj"],
"gptj": ["q_proj", "v_proj"],
"gpt_neox": ["query_key_value"],
"gpt_neo": ["q_proj", "v_proj"],
"bert": ["query", "value"],
"roberta": ["query", "value"],
"xlm-roberta": ["query", "value"],
"electra": ["query", "value"],
"deberta-v2": ["query_proj", "value_proj"],
"deberta": ["in_proj"],
"layoutlm": ["query", "value"],
"llama": ["q_proj", "v_proj"],
"chatglm": ["query_key_value"],
}
TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING = {
"t5": ["q", "k", "v", "o", "wi", "wo"],
"mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"],
"bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
# "gpt2": ["c_attn"],
# "bloom": ["query_key_value"],
"opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
# "gptj": ["q_proj", "v_proj"],
# "gpt_neox": ["query_key_value"],
# "gpt_neo": ["q_proj", "v_proj"],
# "bert": ["query", "value"],
"roberta": ["query", "key", "value", "dense"],
# "xlm-roberta": ["query", "value"],
# "electra": ["query", "value"],
"deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"],
# "deberta": ["in_proj"],
# "layoutlm": ["query", "value"],
}
TRANSFORMERS_MODELS_TO_MMOELORAS_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
TRANSFORMERS_MODELS_TO_MMOELORA_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING = {
"bloom": bloom_model_postprocess_past_key_value,
}
WEIGHTS_NAME = "adapter_model.bin"
CONFIG_NAME = "adapter_config.json"
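
Two of the helpers above are easy to sanity-check in isolation; a short worked example, with the peft_library.utils.other import path assumed:

import torch
from peft_library.utils.other import shift_tokens_right, transpose  # assumed path

input_ids = torch.tensor([[5, 6, 7, -100]])
shifted = shift_tokens_right(input_ids, pad_token_id=0, decoder_start_token_id=2)
print(shifted)  # tensor([[2, 5, 6, 7]]): the start token is prepended and the trailing -100 is shifted out

w = torch.arange(6).reshape(2, 3)
print(transpose(w, True).shape)   # torch.Size([3, 2]) when fan_in_fan_out is True
print(transpose(w, False).shape)  # torch.Size([2, 3]) otherwise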

@@ -0,0 +1,51 @@
import enum
class PeftType(str, enum.Enum):
"""
Enum class for the different types of adapters in PEFT.
Supported PEFT types:
- PROMPT_TUNING
- MULTITASK_PROMPT_TUNING
- P_TUNING
- PREFIX_TUNING
- LORA
- ADALORA
- BOFT
- ADAPTION_PROMPT
- IA3
- LOHA
- LOKR
- OFT
- XLORA
- POLY
- LN_TUNING
- VERA
- FOURIERFT
- HRA
    - BONE
    - VBLORA
    - CPT
    - MMOELORA
    - MMOELORAS
    """
PROMPT_TUNING = "PROMPT_TUNING"
MULTITASK_PROMPT_TUNING = "MULTITASK_PROMPT_TUNING"
P_TUNING = "P_TUNING"
PREFIX_TUNING = "PREFIX_TUNING"
LORA = "LORA"
ADALORA = "ADALORA"
BOFT = "BOFT"
ADAPTION_PROMPT = "ADAPTION_PROMPT"
IA3 = "IA3"
LOHA = "LOHA"
LOKR = "LOKR"
OFT = "OFT"
POLY = "POLY"
LN_TUNING = "LN_TUNING"
VERA = "VERA"
FOURIERFT = "FOURIERFT"
XLORA = "XLORA"
HRA = "HRA"
VBLORA = "VBLORA"
CPT = "CPT"
BONE = "BONE"
MMOELORAS = "MMOELORAS"
MMOELORA = "MMOELORA"
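
Because PeftType mixes in str, members compare equal to their string values, so code can check peft_config.peft_type against either the enum member or the plain string that ends up in serialized configs (mmoelora.py's __post_init__ assigns PeftType.MMOELORA directly). A tiny illustration, using the same import path as mmoelora.py:

from peft_library.utils.peft_types import PeftType

print(PeftType.MMOELORA == "MMOELORA")              # True: str-valued enum
print(PeftType("MMOELORAS") is PeftType.MMOELORAS)  # True: lookup by value
print([t.value for t in PeftType][-2:])             # ['MMOELORAS', 'MMOELORA'], the two new members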

@@ -0,0 +1,130 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .config import PeftType, PromptLearningConfig
def get_peft_model_state_dict(model, state_dict=None, adapter_name="default"):
"""
Get the state dict of the Peft model.
Args:
model ([`PeftModel`]): The Peft model. When using torch.nn.DistributedDataParallel, DeepSpeed or FSDP,
the model should be the underlying model/unwrapped model (i.e. model.module).
state_dict (`dict`, *optional*, defaults to `None`):
The state dict of the model. If not provided, the state dict of the model
will be used.
"""
config = model.peft_config[adapter_name]
if state_dict is None:
state_dict = model.state_dict()
if config.peft_type in (PeftType.LORA, PeftType.ADALORA,
PeftType.MMOELORAS):
# to_return = lora_state_dict(model, bias=model.peft_config.bias)
# adapted from `https://github.com/microsoft/LoRA/blob/main/loralib/utils.py`
# to be used directly with the state dict which is necessary when using DeepSpeed or FSDP
bias = config.bias
if bias == "none": # filter out all lora parameters
to_return = {k: state_dict[k] for k in state_dict if "lora_" in k}
elif bias == "all":
to_return = {k: state_dict[k] for k in state_dict if "lora_" in k or "bias" in k}
elif bias == "lora_only":
to_return = {}
for k in state_dict:
if "lora_" in k:
to_return[k] = state_dict[k]
bias_name = k.split("lora_")[0] + "bias"
if bias_name in state_dict:
to_return[bias_name] = state_dict[bias_name]
else:
raise NotImplementedError
to_return = {k: v for k, v in to_return.items() if (("lora_" in k and adapter_name in k) or ("bias" in k))}
if config.peft_type == PeftType.ADALORA:
rank_pattern = config.rank_pattern
if rank_pattern is not None:
rank_pattern = {k.replace(f".{adapter_name}", ""): v for k, v in rank_pattern.items()}
config.rank_pattern = rank_pattern
to_return = model.resize_state_dict_by_rank_pattern(rank_pattern, to_return, adapter_name)
elif config.peft_type == PeftType.ADAPTION_PROMPT:
to_return = {k: state_dict[k] for k in state_dict if k.split(".")[-1].startswith("adaption_")}
elif isinstance(config, PromptLearningConfig):
to_return = {}
if config.inference_mode:
prompt_embeddings = model.prompt_encoder[adapter_name].embedding.weight
else:
prompt_embeddings = model.get_prompt_embedding_to_save(adapter_name)
to_return["prompt_embeddings"] = prompt_embeddings
else:
raise NotImplementedError
if model.modules_to_save is not None:
for key, value in state_dict.items():
if any(f"{module_name}.modules_to_save.{adapter_name}" in key for module_name in model.modules_to_save):
to_return[key.replace("modules_to_save.", "")] = value
to_return = {k.replace(f".{adapter_name}", ""): v for k, v in to_return.items()}
return to_return
def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="default"):
"""
Set the state dict of the Peft model.
Args:
model ([`PeftModel`]): The Peft model.
peft_model_state_dict (`dict`): The state dict of the Peft model.
"""
config = model.peft_config[adapter_name]
state_dict = {}
if model.modules_to_save is not None:
for key, value in peft_model_state_dict.items():
if any(module_name in key for module_name in model.modules_to_save):
for module_name in model.modules_to_save:
if module_name in key:
key = key.replace(module_name, f"{module_name}.modules_to_save.{adapter_name}")
break
state_dict[key] = value
else:
state_dict = peft_model_state_dict
if config.peft_type in (PeftType.LORA, PeftType.ADALORA,
PeftType.MMOELORAS):
peft_model_state_dict = {}
for k, v in state_dict.items():
if "lora_" in k:
suffix = k.split("lora_")[1]
if "." in suffix:
suffix_to_replace = ".".join(suffix.split(".")[1:])
k = k.replace(suffix_to_replace, f"{adapter_name}.{suffix_to_replace}")
else:
k = f"{k}.{adapter_name}"
peft_model_state_dict[k] = v
else:
peft_model_state_dict[k] = v
if config.peft_type == PeftType.ADALORA:
rank_pattern = config.rank_pattern
if rank_pattern is not None:
model.resize_modules_by_rank_pattern(rank_pattern, adapter_name)
elif isinstance(config, PromptLearningConfig) or config.peft_type == PeftType.ADAPTION_PROMPT:
peft_model_state_dict = state_dict
else:
raise NotImplementedError
model.load_state_dict(peft_model_state_dict, strict=False)
if isinstance(config, PromptLearningConfig):
model.prompt_encoder[adapter_name].embedding.load_state_dict(
{"weight": peft_model_state_dict["prompt_embeddings"]}, strict=True
)
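
A hedged sketch of how the two helpers above could be used to checkpoint only the adapter weights. It leans on the installed peft package for model wrapping, uses a placeholder tiny model, and assumes the peft_library.utils.save_and_load import path; whether the vendored key handling matches the installed peft's parameter naming depends on the peft version, so treat this as a sketch rather than a guaranteed recipe:

import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from peft_library.utils.save_and_load import (  # assumed path
    get_peft_model_state_dict,
    set_peft_model_state_dict,
)

base = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")  # placeholder model
model = get_peft_model(base, LoraConfig(r=4, target_modules=["c_attn"]))

adapter_state = get_peft_model_state_dict(model)   # only the lora_* tensors, adapter name stripped
torch.save(adapter_state, "adapter_model.bin")     # matches WEIGHTS_NAME in other.py

set_peft_model_state_dict(model, torch.load("adapter_model.bin"))
print(len(adapter_state), "adapter tensors saved and restored")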

@@ -1,32 +1,46 @@
 import torch
-from datasets_library.factory import get_dataset
+from dataset_library.factory import get_dataset
 from transformers import AutoModelForVision2Seq, AutoProcessor, TrainingArguments
 from trl import (
     ModelConfig,
     TrlParser,
     get_kbit_device_map,
-    get_peft_config,
+    # get_peft_config,
     get_quantization_config,
 )
-from peft import get_peft_model
+from peft_library import get_peft_model, get_peft_config
 from utils.trainer import ContinualTrainer
-from utils.args import ContinualScriptArguments
+from utils.args import ContinualScriptArguments, ContinualModelConfig
 if __name__ == "__main__":
-    parser = TrlParser((ContinualScriptArguments, TrainingArguments, ModelConfig))
+    parser = TrlParser(
+        (ContinualScriptArguments, TrainingArguments, ContinualModelConfig)
+    )
     script_args, training_args, model_args = parser.parse_args_and_config()
     # for type hint
     if 0 == 1:
         script_args = ContinualScriptArguments()
         training_args = TrainingArguments()
-        model_args = ModelConfig()
+        model_args = ContinualModelConfig()
     training_args.gradient_checkpointing_kwargs = dict(use_reentrant=False)
     training_args.remove_unused_columns = False
     training_args.dataset_kwargs = {"skip_prepare_dataset": True}
+    # peft_config = get_peft_config(dict(**vars(model_args)))
+    if model_args.peft_type == "MMOELora":
+        from peft_library.tuners import MMOELoraConfig
+        peft_config = MMOELoraConfig(target_modules=model_args.lora_target_modules)
+    elif model_args.peft_type == "LORA":
+        from peft.tuners.lora import LoraConfig
+        peft_config = LoraConfig(target_modules=model_args.lora_target_modules)
+    else:
+        peft_config = None
     torch_dtype = (
         model_args.torch_dtype
         if model_args.torch_dtype in ["auto", None]
@@ -62,9 +76,6 @@ if __name__ == "__main__":
     collate_fn_for_train = partial(collate_fn_for_train, processor=processor)
     collate_fn_for_evaluate = partial(collate_fn_for_evaluate, processor=processor)
-    peft_config = get_peft_config(model_args)
-    model = get_peft_model(model, peft_config)
     ################
     # Dataset
     ################
@@ -73,8 +84,10 @@ if __name__ == "__main__":
     accelerator = create_accelerator_and_postprocess(training_args)
-    if accelerator.is_local_main_process:
-        model.print_trainable_parameters()
+    if peft_config is not None:
+        model = get_peft_model(model, peft_config)
+        if accelerator.is_local_main_process:
+            model.print_trainable_parameters()
     for dataset_name in script_args.dataset_name:
         dataset = get_dataset(dataset_name)
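
Note that the dispatch added above is case-sensitive: --peft_type MMOELora selects the MMOE config, --peft_type LORA selects plain LoRA, and anything else falls through to peft_config = None so no adapter is injected. A standalone, hedged reproduction of that branch for quick experimentation; FakeModelArgs is a stand-in for the parsed ContinualModelConfig, not something defined in the repo:

from dataclasses import dataclass
from typing import Optional

@dataclass
class FakeModelArgs:  # stand-in for the parsed ContinualModelConfig
    peft_type: Optional[str] = None
    lora_target_modules: Optional[list] = None

def build_peft_config(model_args):
    if model_args.peft_type == "MMOELora":
        from peft_library.tuners import MMOELoraConfig
        return MMOELoraConfig(target_modules=model_args.lora_target_modules)
    elif model_args.peft_type == "LORA":
        from peft.tuners.lora import LoraConfig
        return LoraConfig(target_modules=model_args.lora_target_modules)
    return None

print(build_peft_config(FakeModelArgs("LORA", ["q_proj", "v_proj"])))
print(build_peft_config(FakeModelArgs()))  # None: training proceeds without wrapping the model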

@@ -1,8 +1,9 @@
 #!/bin/bash
-accelerate launch --config_file accelerate_configs/deepspeed_zero2.yaml train.py \
+accelerate launch --config_file configs/accelerate_configs/deepspeed_zero2.yaml train.py \
 --dataset_name OCR_VQA_200K OCR_VQA_200K OCR_VQA_200K \
 --use_peft \
+--peft_type LORA \
 --model_name_or_path Qwen/Qwen2-VL-7B-Instruct \
 --lora_target_modules q_proj v_proj \
 --per_device_train_batch_size 1 \

@@ -1,17 +1,21 @@
 from dataclasses import dataclass, field
 from typing import Optional
+from trl import ScriptArguments, ModelConfig
+from transformers import TrainingArguments
 @dataclass
-class ContinualScriptArguments:
+class ContinualScriptArguments(ScriptArguments):
     """Script arguments for continual learning."""
     dataset_name: list[str] = field(
         default_factory=lambda: ["cifar10", "cifar100", "imagenet2012"]
     )
-    dataset_config: Optional[str] = None
-    dataset_train_split: str = "train"
-    dataset_test_split: str = "test"
     dataset_generation_split: str = "generation"
-    gradient_checkpointing_use_reentrant: bool = False
-    ignore_bias_buffers: bool = False
+@dataclass
+class ContinualModelConfig(ModelConfig):
+    """Model configuration for continual learning."""
+    peft_type: Optional[str] = None
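
Since ContinualModelConfig only adds a peft_type field on top of trl's ModelConfig, it can be exercised with a plain HfArgumentParser. A hedged sketch: the repo-relative utils.args import mirrors what train.py uses, and the argument values are the ones passed by train.sh above:

from transformers import HfArgumentParser
from utils.args import ContinualModelConfig  # repo-relative import, as in train.py

parser = HfArgumentParser(ContinualModelConfig)
(model_args,) = parser.parse_args_into_dataclasses(
    args=[
        "--model_name_or_path", "Qwen/Qwen2-VL-7B-Instruct",
        "--peft_type", "LORA",
        "--lora_target_modules", "q_proj", "v_proj",
    ]
)
print(model_args.peft_type, model_args.lora_target_modules)  # LORA ['q_proj', 'v_proj']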