
Commit d352200

align rope (#3101)
1 parent 1b29a6b commit d352200

20 files changed, +876 -356 lines changed

paddleformers/transformers/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -193,7 +193,7 @@
     "llama.configuration": [
         "LlamaConfig",
     ],
-    "llama.modeling": ["LlamaForCausalLM", "LlamaModel", "LlamaForCausalLMPipe"],
+    "llama.modeling": ["LlamaForCausalLM", "LlamaModel", "LlamaForCausalLMPipe", "LlamaRotaryEmbedding"],
     "llama.tokenizer": ["LlamaTokenizer", "Llama3Tokenizer"],
     "llama.tokenizer_fast": ["LlamaTokenizerFast"],
     "optimization": [

paddleformers/transformers/deepseek_v3/configuration.py

Lines changed: 1 addition & 0 deletions
@@ -199,6 +199,7 @@ def __init__(
         self.qk_rope_head_dim = qk_rope_head_dim
         self.v_head_dim = v_head_dim
         self.qk_nope_head_dim = qk_nope_head_dim
+        self.head_dim = qk_rope_head_dim
         self.topk_method = topk_method
         self.n_group = n_group
         self.topk_group = topk_group
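Exposing head_dim on the config matters because the shared default-RoPE initializer (added in the modeling files below) resolves the rotary dimension as getattr(config, "head_dim", None) or hidden_size // num_attention_heads. A small sketch of that resolution, with illustrative numbers rather than DeepseekV3 defaults:

# Sketch of how the rotary dim is resolved once `head_dim` is set; the numbers
# below are illustrative, not DeepseekV3 defaults.
class _Cfg:
    hidden_size = 4096
    num_attention_heads = 32
    qk_rope_head_dim = 64
    head_dim = qk_rope_head_dim        # the attribute this commit adds

cfg = _Cfg()
dim = getattr(cfg, "head_dim", None) or cfg.hidden_size // cfg.num_attention_heads
assert dim == 64                       # rotary dim follows qk_rope_head_dim, not 4096 // 32 = 128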

paddleformers/transformers/deepseek_v3/modeling.py

Lines changed: 32 additions & 80 deletions
@@ -61,7 +61,7 @@
     SequenceClassifierOutputWithPast,
 )
 from ..model_utils import PretrainedModel, register_base_model
-from ..modeling_rope_utils import dynamic_rope_update
+from ..modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ..moe_gate import PretrainedMoEGate
 from ..moe_layer import MoEFlexTokenLayer
 from .configuration import DeepseekV3Config
@@ -137,81 +137,6 @@ def yarn_get_mscale(scale, mscale=1):
     return 0.1 * mscale * math.log(scale) + 1.0


-def _compute_yarn_parameters(
-    config,
-    seq_len=None,
-):
-    base = config["rope_theta"]
-    rope_parameters_dict = config["rope_parameters"]
-    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
-    head_dim = getattr(config, "qk_rope_head_dim", config.hidden_size // config.num_attention_heads)
-    dim = int(head_dim * partial_rotary_factor)
-
-    factor = rope_parameters_dict["factor"]
-    attention_factor = rope_parameters_dict.get("attention_factor", None)
-    mscale = rope_parameters_dict.get("mscale")
-    mscale_all_dim = rope_parameters_dict.get("mscale_all_dim")
-
-    # NOTE: DeekSeek-V3 (and potentially other models) modify `max_position_embeddings` and have a
-    # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
-    # values to compute the default attention scaling factor, instead of using `factor`.
-    if "original_max_position_embeddings" in rope_parameters_dict:
-        original_max_position_embeddings = rope_parameters_dict["original_max_position_embeddings"]
-        factor = config.max_position_embeddings / original_max_position_embeddings
-    else:
-        original_max_position_embeddings = config.max_position_embeddings
-
-    # Sets the attention factor as suggested in the paper
-    if attention_factor is None:
-        if mscale and mscale_all_dim:
-            attention_factor = float(yarn_get_mscale(factor, mscale) / yarn_get_mscale(factor, mscale_all_dim))
-        else:
-            attention_factor = yarn_get_mscale(factor)
-
-    # Optional config options
-    # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
-    beta_fast = rope_parameters_dict.get("beta_fast") or 32
-    beta_slow = rope_parameters_dict.get("beta_slow") or 1
-
-    # Compute the inverse frequencies
-    def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
-        """Inverse dimension formula to find the dimension based on the number of rotations"""
-        return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
-
-    def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings, truncate):
-        """Find dimension range bounds based on rotations"""
-        low = find_correction_dim(low_rot, dim, base, max_position_embeddings)
-        high = find_correction_dim(high_rot, dim, base, max_position_embeddings)
-        if truncate:
-            low = math.floor(low)
-            high = math.ceil(high)
-        return max(low, 0), min(high, dim - 1)
-
-    def linear_ramp_factor(min, max, dim):
-        if min == max:
-            max += 0.001  # Prevent singularity
-
-        linear_func = (paddle.arange(dim, dtype=paddle.float32) - min) / (max - min)
-        ramp_func = paddle.clamp(linear_func, 0, 1)
-        return ramp_func
-
-    pos_freqs = base ** (paddle.arange(0, dim, 2).astype(paddle.float32) / dim)
-    inv_freq_extrapolation = 1.0 / pos_freqs
-    inv_freq_interpolation = 1.0 / (factor * pos_freqs)
-
-    # truncate = config.rope_parameters.get("truncate", True)
-    low, high = find_correction_range(beta_fast, beta_slow, dim, base, original_max_position_embeddings, True)
-
-    # Get n-dimensional rotational scaling corrected for extrapolation
-    inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).astype(paddle.float32)
-
-    inv_freq = (
-        inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
-        + inv_freq_extrapolation * inv_freq_extrapolation_factor
-    )
-    return inv_freq, attention_factor
-
-
 class DeepseekV3YarnRotaryEmbedding(nn.Layer):
     def __init__(self, config: DeepseekV3Config, device=None):
         super().__init__()
@@ -221,11 +146,38 @@ def __init__(self, config: DeepseekV3Config, device=None):

         rope_parameters = self.config.rope_parameters
         self.rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default"))
-        assert self.rope_type == "yarn"
+        rope_init_fn = self.compute_default_rope_parameters
+        if self.rope_type != "default":
+            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = rope_init_fn(self.config)
+
+        self.register_buffer("inv_freq", inv_freq, persistable=False)
+        self.original_inv_freq = inv_freq
+
+    @staticmethod
+    def compute_default_rope_parameters(
+        config: Optional[DeepseekV3Config] = None,
+        seq_len: Optional[int] = None,
+    ) -> tuple["paddle.Tensor", float]:
+        """
+        Computes the inverse frequencies according to the original RoPE implementation
+        Args:
+            config ([`PreTrainedConfig`]):
+                The model configuration.
+            seq_len (`int`, *optional*):
+                The current sequence length. Unused for this type of RoPE.
+        Returns:
+            Tuple of (`paddle.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+        """
+        base = config.rope_parameters["rope_theta"]
+        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+
+        attention_factor = 1.0  # Unused in this type of RoPE

-        self.inv_freq, self.attention_scaling = _compute_yarn_parameters(config)
-        self.register_buffer("inv_freq", self.inv_freq, persistable=False)
-        # self.original_inv_freq = self.inv_freq
+        # Compute the inverse frequencies
+        inv_freq = 1.0 / (base ** (paddle.arange(0, dim, 2, dtype=paddle.int64).astype(dtype=paddle.float32) / dim))
+        return inv_freq, attention_factor

     @dynamic_rope_update
     def forward(self, x, position_ids):
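Net effect of this file's changes: the local _compute_yarn_parameters helper is deleted, and the embedding now looks up its initializer from the shared ROPE_INIT_FUNCTIONS registry in modeling_rope_utils, falling back to the local default computation only when rope_type is "default". A minimal sketch of that dispatch pattern; the registry below is a stand-in, since the real registry's keys and contents live in modeling_rope_utils and are not shown in this diff:

# Minimal sketch of the dispatch pattern used in __init__ above. The registry
# here is a hypothetical stand-in for ROPE_INIT_FUNCTIONS.
import paddle

def _default_rope_init(config):
    base = config.rope_parameters["rope_theta"]
    dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
    inv_freq = 1.0 / (base ** (paddle.arange(0, dim, 2, dtype="int64").astype("float32") / dim))
    return inv_freq, 1.0                                # (inv_freq, attention_scaling)

ROPE_INIT_FUNCTIONS = {"default": _default_rope_init}   # stand-in registry

def init_rope(config):
    rope_type = config.rope_parameters.get("rope_type", config.rope_parameters.get("type", "default"))
    rope_init_fn = _default_rope_init if rope_type == "default" else ROPE_INIT_FUNCTIONS[rope_type]
    return rope_init_fn(config)                          # e.g. a "yarn" entry would serve DeepseekV3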

paddleformers/transformers/ernie4_5/modeling.py

Lines changed: 46 additions & 13 deletions
@@ -43,7 +43,7 @@
     CausalLMOutputWithCrossAttentions,
 )
 from ..model_utils import PretrainedModel, register_base_model
-from ..modeling_rope_utils import dynamic_rope_update
+from ..modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ..tensor_parallel_utils import model_parallel_dropout
 from .configuration import Ernie4_5Config

@@ -121,6 +121,38 @@ def __init__(self, config):
         self.base = config.rope_theta
         rope_parameters = config.rope_parameters
         self.rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default"))
+        rope_init_fn = self.compute_default_rope_parameters
+        if self.rope_type != "default":
+            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = rope_init_fn(self.config)
+
+        self.register_buffer("inv_freq", inv_freq, persistable=False)
+        self.original_inv_freq = inv_freq
+
+    @staticmethod
+    def compute_default_rope_parameters(
+        config: Optional[Ernie4_5Config] = None,
+        seq_len: Optional[int] = None,
+    ) -> tuple["paddle.Tensor", float]:
+        """
+        Computes the inverse frequencies according to the original RoPE implementation
+        Args:
+            config ([`PreTrainedConfig`]):
+                The model configuration.
+            seq_len (`int`, *optional*):
+                The current sequence length. Unused for this type of RoPE.
+        Returns:
+            Tuple of (`paddle.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+        """
+        base = config.rope_parameters["rope_theta"]
+        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+
+        attention_factor = 1.0  # Unused in this type of RoPE
+
+        # Compute the inverse frequencies
+        inv_freq = 1.0 / (base ** (paddle.arange(0, dim, 2, dtype=paddle.int64).astype(dtype=paddle.float32) / dim))
+        return inv_freq, attention_factor

     @dynamic_rope_update
     def forward(self, x, position_ids):
@@ -134,18 +166,19 @@ def forward(self, x, position_ids):
         Returns:
             Tensor: Rotary position embeddings of shape [1, 1, seq_length, head_dim]
         """
-        indices = paddle.arange(0, self.head_dim, 2, dtype="float32")
-        indices = 1 / self.base ** (indices / self.head_dim)
-
-        sinusoid_inp = position_ids.unsqueeze(-1).astype("float32") * indices.unsqueeze(
-            0
-        )  # [b, s, 1] * [1, d/2] -> [b, s, d/2]
-        emb = paddle.cat((sinusoid_inp, sinusoid_inp), axis=-1)
-        cos = emb.cos()
-        sin = emb.sin()
-
-        # keeping it in full precision
-        return cos, sin
+        with paddle.amp.auto_cast(enable=False):
+            inv_freq_expanded = self.inv_freq[None, :, None].float().expand([position_ids.shape[0], -1, 1])
+
+            position_ids_expanded = position_ids[:, None, :].float()
+
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+
+            emb = paddle.concat((freqs, freqs), axis=-1)
+
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.astype(dtype=x.dtype), sin.astype(dtype=x.dtype)


 class Ernie4_5Attention(nn.Layer):
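The rewritten forward no longer recomputes frequencies from self.base and self.head_dim; it takes the cached inv_freq buffer, forms an outer product with position_ids in float32, duplicates the result along the last axis, and scales cos/sin by attention_scaling before casting back to the input dtype. A standalone sketch of that shape flow, with illustrative batch/sequence/head sizes:

# Standalone sketch of the cos/sin computation performed by the new forward;
# batch, seq_len and head_dim are illustrative.
import paddle

batch, seq_len, head_dim, base = 2, 8, 64, 10000.0
inv_freq = 1.0 / (base ** (paddle.arange(0, head_dim, 2, dtype="float32") / head_dim))  # [d/2]
position_ids = paddle.arange(seq_len, dtype="float32").tile([batch, 1])                 # [b, s]

inv_freq_expanded = inv_freq.reshape([1, -1, 1]).tile([batch, 1, 1])                    # [b, d/2, 1]
position_ids_expanded = position_ids.unsqueeze(1)                                       # [b, 1, s]
freqs = paddle.matmul(inv_freq_expanded, position_ids_expanded).transpose([0, 2, 1])    # [b, s, d/2]

emb = paddle.concat([freqs, freqs], axis=-1)                                            # [b, s, d]
attention_scaling = 1.0                                                                 # 1.0 for default RoPE
cos, sin = emb.cos() * attention_scaling, emb.sin() * attention_scaling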

paddleformers/transformers/ernie4_5_moe/modeling.py

Lines changed: 46 additions & 13 deletions
@@ -51,7 +51,7 @@
 from ..masking_utils import create_causal_mask_and_row_indices
 from ..model_outputs import MoECausalLMOutputWithPast, MoECausalLMOutputWithPastAndMTP
 from ..model_utils import PretrainedModel, register_base_model
-from ..modeling_rope_utils import dynamic_rope_update
+from ..modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ..tensor_parallel_utils import model_parallel_dropout
 from .configuration import Ernie4_5_MoeConfig

@@ -99,6 +99,38 @@ def __init__(self, config):
         self.base = config.rope_theta
         rope_parameters = config.rope_parameters
         self.rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default"))
+        rope_init_fn = self.compute_default_rope_parameters
+        if self.rope_type != "default":
+            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = rope_init_fn(self.config)
+
+        self.register_buffer("inv_freq", inv_freq, persistable=False)
+        self.original_inv_freq = inv_freq
+
+    @staticmethod
+    def compute_default_rope_parameters(
+        config: Optional[Ernie4_5_MoeConfig] = None,
+        seq_len: Optional[int] = None,
+    ) -> tuple["paddle.Tensor", float]:
+        """
+        Computes the inverse frequencies according to the original RoPE implementation
+        Args:
+            config ([`PreTrainedConfig`]):
+                The model configuration.
+            seq_len (`int`, *optional*):
+                The current sequence length. Unused for this type of RoPE.
+        Returns:
+            Tuple of (`paddle.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+        """
+        base = config.rope_parameters["rope_theta"]
+        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+
+        attention_factor = 1.0  # Unused in this type of RoPE
+
+        # Compute the inverse frequencies
+        inv_freq = 1.0 / (base ** (paddle.arange(0, dim, 2, dtype=paddle.int64).astype(dtype=paddle.float32) / dim))
+        return inv_freq, attention_factor

     @dynamic_rope_update
     def forward(self, x, position_ids):
@@ -112,18 +144,19 @@ def forward(self, x, position_ids):
         Returns:
             Tensor: Rotary position embeddings of shape [1, 1, seq_length, head_dim]
         """
-        indices = paddle.arange(0, self.head_dim, 2, dtype="float32")
-        indices = 1 / self.base ** (indices / self.head_dim)
-
-        sinusoid_inp = position_ids.unsqueeze(-1).astype("float32") * indices.unsqueeze(
-            0
-        )  # [b, s, 1] * [1, d/2] -> [b, s, d/2]
-        emb = paddle.cat((sinusoid_inp, sinusoid_inp), axis=-1)
-        cos = emb.cos()
-        sin = emb.sin()
-
-        # keeping it in full precision
-        return cos, sin
+        with paddle.amp.auto_cast(enable=False):
+            inv_freq_expanded = self.inv_freq[None, :, None].float().expand([position_ids.shape[0], -1, 1])
+
+            position_ids_expanded = position_ids[:, None, :].float()
+
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+
+            emb = paddle.concat((freqs, freqs), axis=-1)
+
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.astype(dtype=x.dtype), sin.astype(dtype=x.dtype)


 class Ernie4_5_MoeMLP(Ernie4_5MLP):

paddleformers/transformers/gemma3_text/configuration.py

Lines changed: 2 additions & 12 deletions
@@ -214,15 +214,7 @@ def __init__(
         self.rope_scaling = rope_scaling

         # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
-        if rope_scaling is not None:
-            if rope_parameters is None:
-                rope_parameters = {"sliding_attention": {"rope_type": "default"}, "full_attention": rope_scaling}
-            elif "full_attention" in rope_parameters:
-                rope_parameters["full_attention"].update(rope_scaling)
-            else:
-                rope_parameters.update(rope_scaling)
-
-        self.rope_parameters = rope_parameters
+        self.rope_parameters = rope_scaling or rope_parameters

         self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6)
         if self.layer_types is None:
@@ -233,9 +225,7 @@ def __init__(
         layer_type_validation(self.layer_types, self.num_hidden_layers)

         # Validate the correctness of rotary position embeddings parameters
-        standardize_rope_params(
-            self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq}
-        )
+        standardize_rope_params(self, rope_theta=rope_theta)
         rope_config_validation(self)
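With this change a user-supplied rope_scaling dict takes precedence wholesale over rope_parameters (the per-layer-type merging of full_attention / sliding_attention entries is gone), and standardize_rope_params now receives the single rope_theta. The precedence itself is plain Python truthiness; a tiny illustration with made-up dicts:

# Illustration of the new precedence rule `rope_scaling or rope_parameters`;
# the dict contents are made up.
rope_scaling = {"rope_type": "linear", "factor": 2.0}
rope_parameters = {"rope_type": "default"}

assert (rope_scaling or rope_parameters) == {"rope_type": "linear", "factor": 2.0}  # rope_scaling wins
assert (None or rope_parameters) == {"rope_type": "default"}                        # fallback when unset

Note that an empty rope_scaling dict is falsy, so under this rule it also falls back to rope_parameters.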

paddleformers/transformers/gemma3_text/modeling.py

Lines changed: 30 additions & 7 deletions
@@ -36,7 +36,7 @@
 )
 from ..model_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ..model_utils import PretrainedModel
-from ..modeling_rope_utils import dynamic_rope_update
+from ..modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from .configuration import Gemma3Config, Gemma3TextConfig

 try:
@@ -126,20 +126,43 @@ def __init__(self, config):
         self.max_seq_len_cached = config.max_position_embeddings
         self.original_max_seq_len = config.max_position_embeddings
         self.config = config
-        base = config.rope_theta
-        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
-        head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
         rope_parameters = self.config.rope_parameters
         self.rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default"))
-        dim = int(head_dim * partial_rotary_factor)

         # TODO: The rope_type here is the 'default', which supports some models such as `gemma-3-1b-it`.
         # Other models, such as `gemma-3-4b-it`, require other types, such as 'linear', which is not supported now.
-        inv_freq = 1.0 / (base ** (paddle.arange(0, dim, 2, dtype=paddle.int64).astype(dtype=paddle.float32) / dim))
-        self.attention_scaling = 1.0
+        rope_init_fn = self.compute_default_rope_parameters
+        if self.rope_type != "default":
+            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = rope_init_fn(self.config)
         self.register_buffer("inv_freq", inv_freq, persistable=False)
         self.original_inv_freq = self.inv_freq

+    @staticmethod
+    def compute_default_rope_parameters(
+        config: Optional[Gemma3TextConfig] = None,
+        seq_len: Optional[int] = None,
+    ) -> tuple["paddle.Tensor", float]:
+        """
+        Computes the inverse frequencies according to the original RoPE implementation
+        Args:
+            config ([`PreTrainedConfig`]):
+                The model configuration.
+            seq_len (`int`, *optional*):
+                The current sequence length. Unused for this type of RoPE.
+        Returns:
+            Tuple of (`paddle.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+        """
+        base = config.rope_parameters["rope_theta"]
+        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+
+        attention_factor = 1.0  # Unused in this type of RoPE
+
+        # Compute the inverse frequencies
+        inv_freq = 1.0 / (base ** (paddle.arange(0, dim, 2, dtype=paddle.int64).astype(dtype=paddle.float32) / dim))
+        return inv_freq, attention_factor
+
     @dynamic_rope_update
     def forward(self, x, position_ids):
         # NOTE: Paddle's Automatic Mixed Precision (AMP) has a default op whitelist that may automatically cast
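The NOTE above (unchanged context) concerns Paddle AMP downcasting ops inside the RoPE forward. The pattern used elsewhere in this commit (see the Ernie4_5 forwards) is to disable autocast around the frequency and trig math so it stays in float32, then cast cos/sin back to the hidden-state dtype. A compact sketch of that pattern, with illustrative shapes:

# Sketch of the AMP-safety pattern: build the RoPE tables in float32 with
# autocast disabled so the AMP whitelist cannot downcast the trig ops.
import paddle

inv_freq = 1.0 / (10000.0 ** (paddle.arange(0, 64, 2, dtype="float32") / 64))  # [d/2]
position_ids = paddle.arange(8, dtype="float32").tile([2, 1])                  # [b, s]

with paddle.amp.auto_cast(enable=False):            # keep the math in fp32 even under AMP
    freqs = position_ids.unsqueeze(-1) * inv_freq    # broadcast: [b, s, 1] * [d/2] -> [b, s, d/2]
    emb = paddle.concat([freqs, freqs], axis=-1)     # [b, s, d]
    cos, sin = emb.cos(), emb.sin()

# The modules then cast back to the hidden-state dtype, e.g. cos.astype(x.dtype).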
