Skip to content

Commit f719f14

Browse files
authored
add glm4.5 yaml (#3129)
1 parent 8bf65f1 commit f719f14

File tree

2 files changed

+124
-0
lines changed

2 files changed

+124
-0
lines changed
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
### data
# Dataset settings for the PT-stage config: the train and eval splits both use
# the `erniekit` dataset type and read JSONL files under ./data/pt/.
# The *_prob values are quoted, so YAML loads them as strings, not floats.
2+
train_dataset_type: erniekit
3+
eval_dataset_type: erniekit
4+
train_dataset_path: ./data/pt/train.jsonl
5+
train_dataset_prob: '1.0'
6+
eval_dataset_path: ./data/pt/eval.jsonl
7+
eval_dataset_prob: '1.0'
8+
max_seq_len: 8192
9+
# NOTE(review): presumably controls how samples are combined up to
# max_seq_len — confirm against the loader.
mix_strategy: concat
10+
11+
### model
# NOTE(review): model_name_or_path is an absolute path tied to one machine
# layout; it must exist on whichever host runs this config.
12+
model_name_or_path: /root/paddlejob/gpfs/efficient_benchmark/huggingface/GLM-4.5-Air
13+
attn_impl: flashmask
14+
15+
### finetuning
16+
# base
# Core run settings: stage, batch sizes, step budget, eval/save cadence,
# logging and output locations.
17+
stage: PT
18+
fine_tuning: full
19+
seed: 23
20+
do_train: true
21+
do_eval: true
22+
per_device_eval_batch_size: 1
23+
per_device_train_batch_size: 1
24+
# NOTE(review): both num_train_epochs and max_steps are set; presumably
# max_steps bounds the run — verify against the trainer.
num_train_epochs: 1
25+
max_steps: 30
26+
eval_iters: 10
27+
# NOTE(review): eval_steps and save_steps (100) are larger than max_steps (30),
# so step-based evaluation/saving presumably never fires mid-run — confirm this
# is intended for the benchmark.
eval_steps: 100
28+
evaluation_strategy: steps
29+
save_steps: 100
30+
save_strategy: steps
31+
logging_steps: 1
32+
gradient_accumulation_steps: 8
33+
logging_dir: ./vdl_log
34+
# NOTE(review): absolute, machine-specific output location.
output_dir: /root/paddlejob/tmpspace/GLM-4.5-Air-PT/outputs/checkpoint
35+
disable_tqdm: true
36+
eval_accumulation_steps: 16
37+
38+
# train
# Learning-rate schedule and data-order settings.
# NOTE(review): warmup_steps (20) covers most of the 30-step budget set by
# max_steps in this file — confirm this is intended.
39+
warmup_steps: 20
40+
learning_rate: 1.0e-05
41+
random_shuffle: false
42+
43+
# performance
# Parallelism, precision, recompute and checkpoint-format settings.
# NOTE(review): how the tensor/pipeline/expert/sharding degrees must relate to
# the device count is not visible here — verify against the launcher.
44+
tensor_parallel_degree: 4
45+
pipeline_parallel_degree: 4
46+
use_expert_parallel: true
47+
expert_parallel_degree: 8
48+
sequence_parallel: true
49+
sharding_parallel_config: split_param
50+
amp_master_grad: true
51+
sharding: stage1
52+
recompute: true
53+
# bf16 is enabled together with fp16_opt_level O2.
# NOTE(review): presumably the opt level also applies to bf16 AMP — confirm.
bf16: true
54+
fp16_opt_level: O2
55+
unified_checkpoint: false
56+
load_via_cpu: true
57+
# Save and load both use the same checkpoint format.
save_checkpoint_format: flex_checkpoint
58+
load_checkpoint_format: flex_checkpoint
59+
save_to_hf: true
60+
61+
benchmark: true
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
### data
# Dataset settings for the SFT-stage config: the train and eval splits both use
# the `erniekit` dataset type and read JSONL files under ./data/sft/.
# The *_prob values are quoted, so YAML loads them as strings, not floats.
2+
train_dataset_type: erniekit
3+
eval_dataset_type: erniekit
4+
train_dataset_path: ./data/sft/train.jsonl
5+
train_dataset_prob: "1.0"
6+
eval_dataset_path: ./data/sft/eval.jsonl
7+
eval_dataset_prob: "1.0"
8+
max_seq_len: 8192
9+
# NOTE(review): presumably controls how samples are combined up to
# max_seq_len — confirm against the loader.
mix_strategy: concat
10+
11+
### model
# NOTE(review): model_name_or_path is an absolute path tied to one machine
# layout; it must exist on whichever host runs this config.
12+
model_name_or_path: /root/paddlejob/gpfs/efficient_benchmark/huggingface/GLM-4.5-Air
13+
attn_impl: flashmask
14+
15+
### finetuning
16+
# base
# Core run settings: stage, batch sizes, step budget, eval/save cadence,
# logging and output locations.
17+
stage: SFT
18+
fine_tuning: full
19+
seed: 23
20+
do_train: true
21+
do_eval: true
22+
per_device_eval_batch_size: 1
23+
per_device_train_batch_size: 1
24+
# NOTE(review): both num_train_epochs and max_steps are set; presumably
# max_steps bounds the run — verify against the trainer.
num_train_epochs: 1
25+
max_steps: 100
26+
eval_iters: 10
27+
# eval_steps and save_steps equal max_steps (100).
# NOTE(review): presumably step-based eval/save is first due on the final
# step — confirm.
eval_steps: 100
28+
evaluation_strategy: steps
29+
save_steps: 100
30+
save_strategy: steps
31+
logging_steps: 1
32+
gradient_accumulation_steps: 8
33+
logging_dir: ./vdl_log
34+
# Checkpoint directory named after the model and stage this config trains
# (GLM-4.5-Air, SFT), so outputs are not mistaken for another model's run.
output_dir: ./checkpoints/glm4p5_air_sft_ckpts
35+
disable_tqdm: true
36+
eval_accumulation_steps: 16
37+
38+
# train
# Learning-rate schedule and data-order settings; warmup_steps (20) is a
# fraction of the 100-step budget set by max_steps in this file.
39+
warmup_steps: 20
40+
learning_rate: 1.0e-5
41+
random_shuffle: false
42+
43+
# performance
# Parallelism, precision, recompute and checkpoint-format settings.
# NOTE(review): how the tensor/pipeline/expert/sharding degrees must relate to
# the device count is not visible here — verify against the launcher.
44+
tensor_parallel_degree: 4
45+
pipeline_parallel_degree: 4
46+
use_expert_parallel: true
47+
expert_parallel_degree: 8
48+
sequence_parallel: true
49+
sharding_parallel_config: "split_param"
50+
amp_master_grad: true
51+
# sharding_parallel_degree: 8
52+
sharding: stage1
53+
recompute: true
54+
# bf16 is enabled together with fp16_opt_level O2.
# NOTE(review): presumably the opt level also applies to bf16 AMP — confirm.
bf16: true
55+
fp16_opt_level: O2
56+
# unified_checkpoint: true
57+
load_via_cpu: true
58+
# resume_from_checkpoint: false
59+
# Save and load both use the same checkpoint format.
save_checkpoint_format: "flex_checkpoint"
60+
load_checkpoint_format: "flex_checkpoint"
61+
save_to_hf: true
62+
63+
benchmark: true

0 commit comments

Comments
 (0)