feat✨: Update training scripts to support MOELORA, adjust gradient accumulation steps, and optimize config files
parent 70c446e548
commit baccca420a
.vscode/settings.json (vendored, 2 changes)
@@ -13,7 +13,7 @@
     // ],
     "python.analysis.languageServerMode": "default",
     "python.analysis.typeCheckingMode": "basic",
-    "python.analysis.userFileIndexingLimit": -1,
+    "python.analysis.userFileIndexingLimit": 10000,
     "python.analysis.usePullDiagnostics": false,
     "python.analysis.importFormat": "relative"
 }
configs/accelerate_configs/deepspeed_zero1.yaml (2 changes)
@@ -2,7 +2,7 @@ compute_environment: LOCAL_MACHINE
 debug: false
 deepspeed_config:
   deepspeed_multinode_launcher: standard
-  gradient_accumulation_steps: 4
+  gradient_accumulation_steps: 2
   zero3_init_flag: false
   zero_stage: 1
 distributed_type: DEEPSPEED
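For context on the change above: under DeepSpeed, the batch size per optimizer step is the per-device batch multiplied by the accumulation steps and the number of processes. A minimal sketch of that arithmetic in Python, using the batch size from the training scripts in this commit and a hypothetical 2-GPU launch:

per_device_train_batch_size = 3  # from the training scripts in this commit
gradient_accumulation_steps = 2  # the new value in this config
num_processes = 2                # assumption: depends on the accelerate launch

effective_batch = per_device_train_batch_size * gradient_accumulation_steps * num_processes
print(effective_batch)  # 12 with these numbers; the old setting of 4 gave 24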
@@ -1 +1 @@
-Subproject commit 83111347f3df66f04bd6759b1a3dcce719380628
+Subproject commit 317d957cc101c4cb064066a1b228526a55f6e927
src/scripts/train_omni.sh (new executable file, 24 lines)
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+accelerate launch --config_file configs/accelerate_configs/deepspeed_zero1.yaml train.py \
+--dataset_name textvqa \
+--use_peft \
+--peft_type MOELORA \
+--model_name_or_path Qwen/Qwen2.5-Omni-3B \
+--lora_target_modules .*model\.layers.*proj \
+--lora_r 8 \
+--lora_alpha 32 \
+--per_device_train_batch_size 3 \
+--per_device_eval_batch_size 1 \
+--gradient_accumulation_steps 2 \
+--num_train_epochs 1 \
+--output_dir checkpoint/qwen2_alllinear/ \
+--learning_rate 2e-4 \
+--warmup_ratio 0.03 \
+--lr_scheduler_type cosine \
+--bf16 \
+--torch_dtype bfloat16 \
+--logging_steps 10 \
+--gradient_checkpointing \
+--weight_decay 0.1 \
+--resume_from_checkpoint /root/autodl-tmp/zhouyunyao/projects/CL-LMM/src/checkpoint/qwen2_alllinear/checkpoint-1000
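A note on the --lora_target_modules value above: PEFT treats a string target_modules as a regular expression and full-matches it against each module's dotted name, so .*model\.layers.*proj restricts the adapter to projection layers under model.layers. A minimal sketch of the matching; the module names are illustrative of a Qwen-style decoder, not read from the real Qwen2.5-Omni module tree:

import re

pattern = r".*model\.layers.*proj"

for name in [
    "model.layers.0.self_attn.q_proj",  # selected: attention projection
    "model.layers.0.mlp.gate_proj",     # selected: MLP projection
    "model.embed_tokens",               # skipped: no trailing "proj"
    "visual.blocks.0.attn.qkv",         # skipped: outside model.layers
]:
    print(name, bool(re.fullmatch(pattern, name)))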
src/todo.md (14 changes)
@@ -27,4 +27,16 @@
 
 [2025.05.16]
 
 - [ ] Handle the different continual-learning frameworks so the overall framework stays compatible with all of them
+
+[2025.05.28]
+
+- [x] MoeLora
+- [ ] Coin Benchmark
+- [x] Decide what gets saved, to make later testing easier
+- [ ] Olora
+- [ ] Hide-Llava
+
+[2025.05.30]
+
+- [ ] Evaluation metrics
train.py
@@ -65,6 +65,13 @@ if __name__ == "__main__":
         model.add_adapter(peft_config)
 
+    elif model_args.peft_type == "MOELORA":
+        from peft.tuners import MOELoraConfig
+
+        peft_config = MOELoraConfig(target_modules=model_args.lora_target_modules)
+
+        model.add_adapter(peft_config)
+
     elif model_args.peft_type == "LORA":
         from peft.tuners.lora import LoraConfig
 
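Upstream peft does not export MOELoraConfig, so the import above presumably resolves against the patched peft submodule updated in this commit. After model.add_adapter(peft_config), PEFT normally leaves only the adapter weights trainable; a quick sanity check one could drop in right after that call (model is the variable from the surrounding script):

# Count trainable vs. total parameters after injecting the adapter.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")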
@@ -3,14 +3,14 @@
 accelerate launch --config_file configs/accelerate_configs/deepspeed_zero1.yaml train.py \
 --dataset_name textvqa \
 --use_peft \
---peft_type LORA \
+--peft_type MOELORA \
 --model_name_or_path Qwen/Qwen2.5-Omni-3B \
 --lora_target_modules .\*proj.\*\|.\*fc.\*\|.\*mlp\.0\|.\*mlp\.2 \
 --lora_r 8 \
 --lora_alpha 32 \
 --per_device_train_batch_size 3 \
---per_device_eval_batch_size 1 \
 --gradient_accumulation_steps 4 \
+--per_device_eval_batch_size 1 \
 --num_train_epochs 1 \
 --output_dir checkpoint/qwen2_alllinear/ \
 --learning_rate 5e-5 \
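One caveat on this script's --lora_target_modules value: the backslashes are shell quoting, so bash strips them before exec and train.py actually receives .*proj.*|.*fc.*|.*mlp.0|.*mlp.2, with the dots unescaped. That is harmless here, since "." also matches a literal dot. A small sketch of what the delivered regex selects (the module names are hypothetical):

import re

received = r".*proj.*|.*fc.*|.*mlp.0|.*mlp.2"  # the argument after bash removes the \ quoting

for name in [
    "model.layers.0.self_attn.q_proj",  # matches the proj alternative
    "audio_tower.layers.0.fc1",         # matches the fc alternative
    "visual.merger.mlp.0",              # matches the mlp.0 alternative
]:
    print(name, bool(re.fullmatch(received, name)))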
@@ -1 +1 @@
-Subproject commit c8a4ee5b9daf9865b372a483fd04a984f0b265dc
+Subproject commit 42a8639e1e827d6f0ab07d87078ff048b20dab19