add glm4.5 yaml (#3129)

risemeup1 · web-flow · commit f719f1452137 · 2025-12-10T18:05:15.000+08:00
diff --git a/benchmark/config/pt/GLM4.5-Air.yaml b/benchmark/config/pt/GLM4.5-Air.yaml
@@ -0,0 +1,61 @@
+### data
+train_dataset_type: erniekit
+eval_dataset_type: erniekit
+train_dataset_path: ./data/pt/train.jsonl  # relative path (presumably resolved against the launch directory - confirm)
+train_dataset_prob: '1.0'  # quoted, so it loads as the string "1.0", not a float
+eval_dataset_path: ./data/pt/eval.jsonl
+eval_dataset_prob: '1.0'  # quoted string, same form as train_dataset_prob
+max_seq_len: 8192
+mix_strategy: concat
+
+### model
+model_name_or_path: /root/paddlejob/gpfs/efficient_benchmark/huggingface/GLM-4.5-Air  # machine-specific absolute path
+attn_impl: flashmask
+
+### finetuning
+# base
+stage: PT
+fine_tuning: full
+seed: 23
+do_train: true
+do_eval: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: 30  # NOTE(review): below eval_steps/save_steps (100); step-triggered eval/save presumably never fires - confirm
+eval_iters: 10
+eval_steps: 100
+evaluation_strategy: steps
+save_steps: 100
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 8
+logging_dir: ./vdl_log
+output_dir: /root/paddlejob/tmpspace/GLM-4.5-Air-PT/outputs/checkpoint  # machine-specific absolute path
+disable_tqdm: true
+eval_accumulation_steps: 16
+
+# train
+warmup_steps: 20  # 20 of the 30 max_steps set above
+learning_rate: 1.0e-05
+random_shuffle: false
+
+# performance
+tensor_parallel_degree: 4
+pipeline_parallel_degree: 4  # with tensor_parallel_degree: 4 x 4 = 16 (presumably the minimum device count - confirm)
+use_expert_parallel: true
+expert_parallel_degree: 8
+sequence_parallel: true
+sharding_parallel_config: split_param
+amp_master_grad: true
+sharding: stage1
+recompute: true
+bf16: true
+fp16_opt_level: O2  # NOTE(review): key is named fp16 while bf16 is enabled; presumably the AMP level also applies to bf16 - confirm
+unified_checkpoint: false
+load_via_cpu: true
+save_checkpoint_format: flex_checkpoint
+load_checkpoint_format: flex_checkpoint  # same format as save_checkpoint_format
+save_to_hf: true
+
+benchmark: true
diff --git a/benchmark/config/sft/GLM4.5-Air.yaml b/benchmark/config/sft/GLM4.5-Air.yaml
@@ -0,0 +1,63 @@
+### data
+train_dataset_type: erniekit
+eval_dataset_type: erniekit
+train_dataset_path: ./data/sft/train.jsonl  # relative path (presumably resolved against the launch directory - confirm)
+train_dataset_prob: '1.0'  # quoted, so it loads as the string "1.0", not a float
+eval_dataset_path: ./data/sft/eval.jsonl
+eval_dataset_prob: '1.0'  # quoted string, same form as train_dataset_prob
+max_seq_len: 8192
+mix_strategy: concat
+
+### model
+model_name_or_path: /root/paddlejob/gpfs/efficient_benchmark/huggingface/GLM-4.5-Air  # machine-specific absolute path
+attn_impl: flashmask
+
+### finetuning
+# base
+stage: SFT
+fine_tuning: full
+seed: 23
+do_train: true
+do_eval: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: 100  # equals eval_steps and save_steps below
+eval_iters: 10
+eval_steps: 100
+evaluation_strategy: steps
+save_steps: 100
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 8
+logging_dir: ./vdl_log
+output_dir: /root/paddlejob/tmpspace/GLM-4.5-Air-SFT/outputs/checkpoint  # GLM-specific dir mirroring the PT config (was a leftover qwen3_hf_0p6b path)
+disable_tqdm: true
+eval_accumulation_steps: 16
+
+# train
+warmup_steps: 20  # 20 of the 100 max_steps set above
+learning_rate: 1.0e-5
+random_shuffle: false
+
+# performance
+tensor_parallel_degree: 4
+pipeline_parallel_degree: 4  # with tensor_parallel_degree: 4 x 4 = 16 (presumably the minimum device count - confirm)
+use_expert_parallel: true
+expert_parallel_degree: 8
+sequence_parallel: true
+sharding_parallel_config: split_param
+amp_master_grad: true
+# sharding_parallel_degree: 8
+sharding: stage1
+recompute: true
+bf16: true
+fp16_opt_level: O2  # NOTE(review): key is named fp16 while bf16 is enabled; presumably the AMP level also applies to bf16 - confirm
+# unified_checkpoint: true
+load_via_cpu: true
+# resume_from_checkpoint: false
+save_checkpoint_format: flex_checkpoint
+load_checkpoint_format: flex_checkpoint  # same format as save_checkpoint_format
+save_to_hf: true
+
+benchmark: true