@@ -64,13 +64,20 @@ def forward(self, time_steps: np.ndarray) -> np.ndarray:
6464# -------------------------------
6565class PositionwiseFeedForward :
6666 def __init__ (
67- self , d_model : int , hidden : int , drop_prob : float = 0.0 ,
68- seed : Optional [int ] = None
67+ self ,
68+ d_model : int ,
69+ hidden : int ,
70+ drop_prob : float = 0.0 ,
71+ seed : Optional [int ] = None ,
6972 ) -> None :
7073 self .rng = np .random .default_rng (seed )
71- self .w1 = self .rng .standard_normal ((d_model , hidden )) * math .sqrt (2.0 / (d_model + hidden ))
74+ self .w1 = self .rng .standard_normal ((d_model , hidden )) * math .sqrt (
75+ 2.0 / (d_model + hidden )
76+ )
7277 self .b1 = np .zeros ((hidden ,))
73- self .w2 = self .rng .standard_normal ((hidden , d_model )) * math .sqrt (2.0 / (hidden + d_model ))
78+ self .w2 = self .rng .standard_normal ((hidden , d_model )) * math .sqrt (
79+ 2.0 / (hidden + d_model )
80+ )
7481 self .b2 = np .zeros ((d_model ,))
7582
7683 def forward (self , input_tensor : np .ndarray ) -> np .ndarray :
@@ -96,8 +103,11 @@ def forward(self, input_tensor: np.ndarray) -> np.ndarray:
96103# -------------------------------
97104class ScaledDotProductAttention :
98105 def forward (
99- self , query : np .ndarray , key : np .ndarray , value : np .ndarray ,
100- mask : np .ndarray | None = None
106+ self ,
107+ query : np .ndarray ,
108+ key : np .ndarray ,
109+ value : np .ndarray ,
110+ mask : np .ndarray | None = None ,
101111 ) -> tuple [np .ndarray , np .ndarray ]:
102112 """
103113 Compute scaled dot-product attention.
@@ -134,31 +144,46 @@ def __init__(self, d_model: int, n_head: int, seed: Optional[int] = None) -> Non
134144 self .n_head = n_head
135145 self .d_k = d_model // n_head
136146
137- self .w_q = self .rng .standard_normal ((d_model , d_model )) * math .sqrt (2.0 / (d_model + d_model ))
147+ self .w_q = self .rng .standard_normal ((d_model , d_model )) * math .sqrt (
148+ 2.0 / (d_model + d_model )
149+ )
138150 self .b_q = np .zeros ((d_model ,))
139- self .w_k = self .rng .standard_normal ((d_model , d_model )) * math .sqrt (2.0 / (d_model + d_model ))
151+ self .w_k = self .rng .standard_normal ((d_model , d_model )) * math .sqrt (
152+ 2.0 / (d_model + d_model )
153+ )
140154 self .b_k = np .zeros ((d_model ,))
141- self .w_v = self .rng .standard_normal ((d_model , d_model )) * math .sqrt (2.0 / (d_model + d_model ))
155+ self .w_v = self .rng .standard_normal ((d_model , d_model )) * math .sqrt (
156+ 2.0 / (d_model + d_model )
157+ )
142158 self .b_v = np .zeros ((d_model ,))
143- self .w_out = self .rng .standard_normal ((d_model , d_model )) * math .sqrt (2.0 / (d_model + d_model ))
159+ self .w_out = self .rng .standard_normal ((d_model , d_model )) * math .sqrt (
160+ 2.0 / (d_model + d_model )
161+ )
144162 self .b_out = np .zeros ((d_model ,))
145163
146164 self .attn = ScaledDotProductAttention ()
147165
148- def _linear (self , x : np .ndarray , weight : np .ndarray , bias : np .ndarray ) -> np .ndarray :
166+ def _linear (
167+ self , x : np .ndarray , weight : np .ndarray , bias : np .ndarray
168+ ) -> np .ndarray :
149169 return np .tensordot (x , weight , axes = ([2 ], [0 ])) + bias
150170
151171 def _split_heads (self , x : np .ndarray ) -> np .ndarray :
152172 batch_size , seq_len , _ = x .shape
153- return x .reshape (batch_size , seq_len , self .n_head , self .d_k ).transpose (0 , 2 , 1 , 3 )
173+ return x .reshape (batch_size , seq_len , self .n_head , self .d_k ).transpose (
174+ 0 , 2 , 1 , 3
175+ )
154176
155177 def _concat_heads (self , x : np .ndarray ) -> np .ndarray :
156178 batch_size , n_head , seq_len , d_k = x .shape
157179 return x .transpose (0 , 2 , 1 , 3 ).reshape (batch_size , seq_len , n_head * d_k )
158180
159181 def forward (
160- self , query : np .ndarray , key : np .ndarray , value : np .ndarray ,
161- mask : np .ndarray | None = None
182+ self ,
183+ query : np .ndarray ,
184+ key : np .ndarray ,
185+ value : np .ndarray ,
186+ mask : np .ndarray | None = None ,
162187 ) -> tuple [np .ndarray , np .ndarray ]:
163188 """
164189 Parameters
@@ -184,6 +209,8 @@ def forward(
184209 concat = self ._concat_heads (context )
185210 out = np .tensordot (concat , self .w_out , axes = ([2 ], [0 ])) + self .b_out
186211 return out , attn_weights
212+
213+
187214# -------------------------------
188215# 🔹 LayerNorm
189216# -------------------------------
@@ -215,13 +242,17 @@ def forward(self, input_tensor: np.ndarray) -> np.ndarray:
215242# 🔹 TransformerEncoderLayer
216243# -------------------------------
217244class TransformerEncoderLayer :
def __init__(
    self,
    d_model: int,
    n_head: int,
    hidden_dim: int,
    seed: Optional[int] = None,
) -> None:
    """
    Build the sub-modules of one encoder layer.

    Parameters
    ----------
    d_model : int
        Passed to the attention block, the feed-forward block and both
        layer norms.
    n_head : int
        Passed to the attention block.
    hidden_dim : int
        Passed to the feed-forward block as its ``hidden`` size.
    seed : Optional[int]
        Passed unchanged to both the attention and feed-forward blocks.
    """
    # NOTE(review): attention and feed-forward receive the same seed, so
    # their generators start from the same state — confirm this is intended.
    attention = MultiHeadAttention(d_model, n_head, seed)
    feed_forward = PositionwiseFeedForward(d_model, hidden_dim, seed=seed)

    self.self_attn = attention
    self.ffn = feed_forward
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
223252
224- def forward (self , input_tensor : np .ndarray , mask : np .ndarray | None = None ) -> np .ndarray :
253+ def forward (
254+ self , input_tensor : np .ndarray , mask : np .ndarray | None = None
255+ ) -> np .ndarray :
225256 """
226257 Parameters
227258 ----------
@@ -235,7 +266,9 @@ def forward(self, input_tensor: np.ndarray, mask: np.ndarray | None = None) -> n
235266 np.ndarray
236267 Shape (batch, seq_len, d_model)
237268 """
238- attn_out , _ = self .self_attn .forward (input_tensor , input_tensor , input_tensor , mask )
269+ attn_out , _ = self .self_attn .forward (
270+ input_tensor , input_tensor , input_tensor , mask
271+ )
239272 x_norm1 = self .norm1 .forward (input_tensor + attn_out )
240273 ffn_out = self .ffn .forward (x_norm1 )
241274 x_norm2 = self .norm2 .forward (x_norm1 + ffn_out )
@@ -246,10 +279,22 @@ def forward(self, input_tensor: np.ndarray, mask: np.ndarray | None = None) -> n
246279# 🔹 TransformerEncoder (stack)
247280# -------------------------------
248281class TransformerEncoder :
def __init__(
    self,
    d_model: int,
    n_head: int,
    hidden_dim: int,
    num_layers: int,
    seed: Optional[int] = None,
) -> None:
    """
    Build a stack of ``num_layers`` encoder layers.

    Parameters
    ----------
    d_model : int
        Passed to every ``TransformerEncoderLayer``.
    n_head : int
        Passed to every ``TransformerEncoderLayer``.
    hidden_dim : int
        Passed to every ``TransformerEncoderLayer``.
    num_layers : int
        Number of layers to create; values <= 0 give an empty stack.
    seed : Optional[int]
        Base seed. When given, each layer gets its own seed derived
        deterministically from it; when ``None``, each layer is built
        with ``seed=None``.
    """
    layer_count = max(num_layers, 0)

    if seed is None:
        layer_seeds = [None] * layer_count
    else:
        # Handing the same seed to every layer would initialise all layers
        # with identical weights. Derive one distinct, reproducible seed
        # per layer from the base seed instead.
        seed_words = np.random.SeedSequence(seed).generate_state(layer_count)
        layer_seeds = [int(word) for word in seed_words]

    self.layers = [
        TransformerEncoderLayer(d_model, n_head, hidden_dim, layer_seed)
        for layer_seed in layer_seeds
    ]
251294
252- def forward (self , input_tensor : np .ndarray , mask : np .ndarray | None = None ) -> np .ndarray :
295+ def forward (
296+ self , input_tensor : np .ndarray , mask : np .ndarray | None = None
297+ ) -> np .ndarray :
253298 """
254299 Parameters
255300 ----------
@@ -278,7 +323,9 @@ def __init__(self, d_model: int, seed: Optional[int] = None) -> None:
278323 self .w = self .rng .standard_normal ((d_model ,)) * math .sqrt (2.0 / d_model )
279324 self .b = 0.0
280325
281- def forward (self , input_tensor : np .ndarray , mask : np .ndarray | None = None ) -> tuple [np .ndarray , np .ndarray ]:
326+ def forward (
327+ self , input_tensor : np .ndarray , mask : np .ndarray | None = None
328+ ) -> tuple [np .ndarray , np .ndarray ]:
282329 """
283330 Parameters
284331 ----------
@@ -315,27 +362,33 @@ def __init__(
315362 num_layers : int = 4 ,
316363 output_dim : int = 1 ,
317364 task_type : str = "regression" ,
318- seed : Optional [int ] = None
365+ seed : Optional [int ] = None ,
319366 ) -> None :
320367 self .rng = np .random .default_rng (seed )
321368 self .feature_dim = feature_dim
322369 self .d_model = d_model
323370 self .task_type = task_type
324371
325- self .w_in = self .rng .standard_normal ((feature_dim , d_model )) * math .sqrt (2.0 / (feature_dim + d_model ))
372+ self .w_in = self .rng .standard_normal ((feature_dim , d_model )) * math .sqrt (
373+ 2.0 / (feature_dim + d_model )
374+ )
326375 self .b_in = np .zeros ((d_model ,))
327376
328377 self .time2vec = Time2Vec (d_model , seed )
329378 self .encoder = TransformerEncoder (d_model , n_head , hidden_dim , num_layers , seed )
330379 self .pooling = AttentionPooling (d_model , seed )
331380
332- self .w_out = self .rng .standard_normal ((d_model , output_dim )) * math .sqrt (2.0 / (d_model + output_dim ))
381+ self .w_out = self .rng .standard_normal ((d_model , output_dim )) * math .sqrt (
382+ 2.0 / (d_model + output_dim )
383+ )
333384 self .b_out = np .zeros ((output_dim ,))
334385
335386 def _input_proj (self , features : np .ndarray ) -> np .ndarray :
336387 return np .tensordot (features , self .w_in , axes = ([2 ], [0 ])) + self .b_in
337388
338- def forward (self , features : np .ndarray , mask : np .ndarray | None = None ) -> tuple [np .ndarray , np .ndarray ]:
389+ def forward (
390+ self , features : np .ndarray , mask : np .ndarray | None = None
391+ ) -> tuple [np .ndarray , np .ndarray ]:
339392 """
340393 Parameters
341394 ----------
0 commit comments