feat: Update the Python version and dependency management; optimize the training script to support Flash Attention 2

YunyaoZhou 2025-01-11 03:53:23 +08:00
parent b766c21c9b
commit 90b3181f3f
Signed by: shujakuin
GPG Key ID: 418C3CA28E350CCF
8 changed files with 1807 additions and 16 deletions

1
.python-version Normal file
View File

@ -0,0 +1 @@
3.11.7

View File

@ -1,3 +1,3 @@
#!/bin/bash
uv venv --python 3.11.7
pip install -U torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --extra-index-url https://download.pytorch.org/whl/cu124
uv sync
uv sync --extra compile
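
The setup script now provisions the environment with uv instead of installing the CUDA wheels directly through pip: it creates a Python 3.11.7 virtual environment, syncs the locked dependencies, and then builds flash-attn through the optional compile extra. A minimal post-install sanity check, written as a sketch (the expected versions are the pins from the pyproject.toml below; none of this code is part of the commit):

import importlib.util

import torch

print("torch:", torch.__version__)            # expected 2.5.1+cu124
print("CUDA runtime:", torch.version.cuda)    # expected 12.4
print("CUDA available:", torch.cuda.is_available())

if importlib.util.find_spec("flash_attn") is not None:
    import flash_attn
    print("flash-attn:", flash_attn.__version__)  # present only after `uv sync --extra compile`
else:
    print("flash-attn not installed; run `uv sync --extra compile`")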

46
pyproject.toml Normal file
View File

@ -0,0 +1,46 @@
[project]
dependencies = [
"accelerate==1.2.1",
"datasets==3.2.0",
"deepspeed==0.16.2",
"evaluate==0.4.3",
"markupsafe==2.1.5",
"peft==0.14.0",
"pip==24.3.1",
"requests==2.32.3",
"setuptools>=70.0.0",
"torch==2.5.1+cu124",
"torchaudio==2.5.1+cu124",
"torchvision==0.20.1+cu124",
"transformers==4.46.1",
"trl==0.13.0",
"wheel>=0.45.1",
]
description = "Add your description here"
name = "cl-lmm"
readme = "README.md"
requires-python = ">=3.11"
version = "0.1.0"
[project.optional-dependencies]
compile = [
"flash-attn>=2.7.2.post1",
]
[tool.uv.sources]
markupsafe = {index = "pytorch"}
requests = {index = "pypi"}
torch = {index = "pytorch"}
torchaudio = {index = "pytorch"}
torchvision = {index = "pytorch"}
[[tool.uv.index]]
name = "pypi"
url = "https://pypi.org/simple"
[tool.uv]
no-build-isolation-package = ["flash-attn"]
[[tool.uv.index]]
name = "pytorch"
url = "https://download.pytorch.org/whl/cu124"

View File

@ -1,11 +0,0 @@
accelerate==1.2.1
deepspeed==0.16.2
evaluate==0.4.3
peft==0.14.0
pillow==10.2.0
torch==2.5.1+cu124
torchaudio==2.5.1+cu124
torchvision==0.20.1+cu124
transformers==4.46.1
trl==0.13.0
pillow==9.5.0

View File

@ -1,5 +1,12 @@
# from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLCausalLMOutputWithPast, Qwen2VLModel, Qwen2VLForConditionalGeneration, logger, DynamicCache, Qwen2VLDecoderLayer, Qwen2VLConfig, Qwen2VLAttention,
from transformers.models.qwen2_vl.modeling_qwen2_vl import *

if is_flash_attn_2_available():
    from flash_attn import flash_attn_varlen_func
    from transformers.modeling_flash_attention_utils import _flash_attention_forward
else:
    flash_attn_varlen_func = None

from transformers.cache_utils import DynamicCache
from transformers.modeling_outputs import BaseModelOutputWithPast
import torch
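
The flash-attn imports are guarded by is_flash_attn_2_available(), so the module still imports cleanly on machines where the compile extra was not installed. A hedged sketch of the same pattern used to pick an attention implementation at load time (pick_attn_implementation is a hypothetical helper, not part of this commit):

from transformers.utils import is_flash_attn_2_available


def pick_attn_implementation(preferred: str = "flash_attention_2") -> str:
    # Hypothetical helper: fall back to SDPA when flash-attn is not installed.
    if preferred == "flash_attention_2" and not is_flash_attn_2_available():
        return "sdpa"
    return preferred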
@ -246,9 +253,145 @@ class Qwen2VLSdpaAttention_modified(Qwen2VLAttention_modified):

        return attn_output, None, past_key_value


class Qwen2VLFlashAttention2_modified(Qwen2VLAttention_modified):
    """
    Qwen2VL flash attention module, following the Qwen2VL attention module. This module inherits from
    `Qwen2VLAttention` as the weights of the module stay untouched. The only required change is in the forward
    pass, where it needs to correctly call the public API of Flash Attention and handle padding tokens if the
    input contains any. Additionally, for sliding window attention, we apply SWA only to the bottom
    config.max_window_layers layers.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right
        # alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle the
        # difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1)
        # produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            Tuple[torch.Tensor, torch.Tensor]
        ] = None,  # will become mandatory in v4.46
        **kwargs,
    ):
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states, **kwargs)
        key_states = self.k_proj(hidden_states, **kwargs)
        value_states = self.v_proj(hidden_states, **kwargs)

        query_states = query_states.view(
            bsz, q_len, self.num_heads, self.head_dim
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)

        # Because the input can be padded, the absolute sequence length depends on the max position id.
        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_multimodal_rotary_pos_emb(
            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
        )

        if past_key_value is not None:
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "cache_position": cache_position,
            }  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, the layer norms are usually cast to float32 for training stability, so the input hidden
        # states can get silently cast to float32. Hence, we need to cast them back to float16 to be sure
        # everything works as expected.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seem to have been silently cast to float32; this might be because "
                f"you have upcast embedding or layer norm layers to float32. We will cast the input back to "
                f"{target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Reshape to the expected shape for Flash Attention
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        if (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            sliding_window = self.config.sliding_window
        else:
            sliding_window = None

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            dropout=dropout_rate,
            sliding_window=sliding_window,
            is_causal=self.is_causal,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


QWEN2_VL_ATTENTION_CLASSES = {
    "eager": Qwen2VLAttention,
    # "flash_attention_2": Qwen2VLFlashAttention2,  (superseded in this commit by the modified class below)
    "flash_attention_2": Qwen2VLFlashAttention2_modified,
    "sdpa": Qwen2VLSdpaAttention_modified,
}
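
QWEN2_VL_ATTENTION_CLASSES is the table consulted when the model config resolves its attention implementation, so pointing the "flash_attention_2" key at the modified class is what routes Flash Attention 2 requests through the forward pass above. A usage sketch, assuming a stock Qwen2-VL checkpoint (the checkpoint id is illustrative; the training scripts in this repo pass the model id via CLI arguments):

import torch
from transformers import Qwen2VLForConditionalGeneration

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",            # illustrative checkpoint id
    torch_dtype=torch.bfloat16,             # FA2 expects fp16/bf16 activations
    attn_implementation="flash_attention_2",
)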

View File

@ -62,6 +62,7 @@ if __name__ == "__main__":
        trust_remote_code=model_args.trust_remote_code,
        **model_kwargs,
    )
    print(model)
    from model_library.qwen2vl import (
        collate_fn_for_train,
        collate_fn_for_evaluate,
@ -85,13 +86,16 @@ if __name__ == "__main__":
        peft_config = MMOELoraConfig(target_modules=model_args.lora_target_modules)
        model = get_peft_model(model, peft_config)
        # model = inject_adapter_in_model(peft_config, model)
        if accelerator.is_local_main_process:
            model.print_trainable_parameters()

    elif model_args.peft_type == "LORA":
        from peft.tuners.lora import LoraConfig

        peft_config = LoraConfig(target_modules=model_args.lora_target_modules)
        model = get_peft_model(model, peft_config)
        if accelerator.is_local_main_process:
            model.print_trainable_parameters()
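
Both PEFT branches now print the trainable-parameter summary only on the local main process, which keeps multi-GPU logs readable. For reference, a minimal LoRA configuration matching the --lora_target_modules q_proj v_proj flag used in the launch script below; the rank, alpha, and dropout values are illustrative defaults rather than values taken from this commit:

from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    target_modules=["q_proj", "v_proj"],  # mirrors --lora_target_modules q_proj v_proj
    r=8,                                  # illustrative rank
    lora_alpha=16,                        # illustrative scaling factor
    lora_dropout=0.05,                    # illustrative dropout
)
# model = get_peft_model(model, peft_config)  # as done in the training script above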

View File

@ -8,7 +8,8 @@ accelerate launch --config_file configs/accelerate_configs/deepspeed_zero2.yaml
    --lora_target_modules q_proj v_proj \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 2 \
    --gradient_accumulation_steps 8 \
    --attn_implementation flash_attention_2 \
    --gradient_accumulation_steps 16 \
    --output_dir checkpoint/sft-llava-1.5-7b-hf \
    --bf16 \
    --torch_dtype bfloat16
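
Besides switching the attention kernel with --attn_implementation flash_attention_2, the launch script doubles --gradient_accumulation_steps from 8 to 16, which doubles the number of samples contributing to each optimizer step. A small sketch of the arithmetic (the GPU count is an assumption; the script does not fix it):

def effective_batch_size(per_device: int, grad_accum: int, num_processes: int) -> int:
    # Samples contributing to one optimizer step across all processes.
    return per_device * grad_accum * num_processes


# With --per_device_train_batch_size 1 and --gradient_accumulation_steps 16 on, say, 4 GPUs:
print(effective_batch_size(1, 16, 4))  # 64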

1607
uv.lock generated Normal file

File diff suppressed because it is too large