@@ -538,11 +538,9 @@ static ggml_tensor *apply_rotary_emb_basic(ModelContext *mctx, ggml_tensor *laye
     // tensor a (activation) is of shape [s, #h, d]
     // tensor b (position_ids) is of shape [s]
     ggml_context *ctx = mctx->ctx_b.get();
-#ifdef GGML_USE_CUDA
-    if (!ggml_is_contiguous(layer)) {
+    if (ggml_cpu_has_cuda() && !ggml_is_contiguous(layer)) {
         layer = ggml_cont(ctx, layer);
     }
-#endif
     const int head_size = layer->ne[0];
     layer = ggml_rope_ext_inplace(ctx, layer, position_ids, nullptr, head_size, (int)rope_type, 0, rope_theta, 1.0f,
                                   0.0f, 1.0f, 0.0f, 0.0f); // [s, #h, d]
@@ -568,18 +566,20 @@ static ggml_tensor *apply_rotary_emb_glm(ModelContext *mctx, ggml_tensor *layer,
 
     ggml_tensor *a1_rope = a1;
     ggml_tensor *a2_rope = a2;
-#ifdef GGML_USE_CUDA
-    a1_rope = ggml_cont(ctx, a1_rope);
-    a2_rope = ggml_cont(ctx, a2_rope);
-#endif
+
+    if (ggml_cpu_has_cuda()) {
+        a1_rope = ggml_cont(ctx, a1_rope);
+        a2_rope = ggml_cont(ctx, a2_rope);
+    }
 
     a1_rope = ggml_rope_inplace(ctx, a1_rope, b1, rope_dim, (int)RopeType::NEOX); // [s, #h, d/2]
     a2_rope = ggml_rope_inplace(ctx, a2_rope, b2, rope_dim, (int)RopeType::NEOX); // [s, #h, d/2]
 
-#ifdef GGML_USE_CUDA
-    a1_rope = ggml_cpy(ctx, a1_rope, a1);
-    a2_rope = ggml_cpy(ctx, a2_rope, a2);
-#endif
+    if (ggml_cpu_has_cuda()) {
+        a1_rope = ggml_cpy(ctx, a1_rope, a1);
+        a2_rope = ggml_cpy(ctx, a2_rope, a2);
+    }
+
     ggml_build_forward_expand(mctx->gf, a1_rope);
     ggml_build_forward_expand(mctx->gf, a2_rope);
 
@@ -599,15 +599,15 @@ static ggml_tensor *apply_rotary_emb_glm2(ModelContext *mctx, ggml_tensor *layer
         ggml_view_3d(ctx, layer, rope_dim, layer->ne[1], layer->ne[2], layer->nb[1], layer->nb[2], 0);
 
     ggml_tensor *half_layer = half_layer_view;
-#ifdef GGML_USE_CUDA
-    half_layer = ggml_cont(ctx, half_layer);
-#endif
+    if (ggml_cpu_has_cuda()) {
+        half_layer = ggml_cont(ctx, half_layer);
+    }
     ggml_tensor *roped_half_layer =
         ggml_rope_ext_inplace(ctx, half_layer, position_ids, nullptr, rope_dim, (int)RopeType::GPTJ, 0, rope_theta,
                               1.0f, 0.0f, 1.0f, 0.0f, 0.0f); // [s, #h, d]
-#ifdef GGML_USE_CUDA
-    roped_half_layer = ggml_cpy(ctx, roped_half_layer, half_layer_view);
-#endif
+    if (ggml_cpu_has_cuda()) {
+        roped_half_layer = ggml_cpy(ctx, roped_half_layer, half_layer_view);
+    }
     ggml_build_forward_expand(mctx->gf, roped_half_layer);
 
     return layer;
@@ -677,6 +677,7 @@ ggml_tensor *BasicAttention::forward(ModelContext *mctx, ggml_tensor *hidden_sta
     key_layer = ggml_permute(ctx, key_layer, 0, 2, 1, 3);     // [#kvh, s, d]
     value_layer = ggml_permute(ctx, value_layer, 1, 2, 0, 3); // [#kvh, d, s]
 
+    ggml_tensor *context_layer;
     if (k_cache && v_cache) {
         // store key & value to cache
         ggml_tensor *k_cache_view =
@@ -695,46 +696,47 @@ ggml_tensor *BasicAttention::forward(ModelContext *mctx, ggml_tensor *hidden_sta
         value_layer = ggml_view_3d(ctx, v_cache, num_virtual_tokens + n_past + qlen, head_size, num_key_value_heads,
                                    v_cache->nb[1], v_cache->nb[2],
                                    0); // [#kvh, d, kvs]
-    } else {
-        key_layer = ggml_cont(ctx, key_layer);
-        value_layer = ggml_cont(ctx, value_layer);
-    }
 
-    // attention
-    query_layer = ggml_scale_inplace(ctx, query_layer, 1.f / std::sqrt(head_size));
-    ggml_tensor *attn_scores = ggml_mul_mat(ctx, key_layer, query_layer); // [#kvh, (#h/#kvh) * s, kvs]
+        // attention
+        query_layer = ggml_scale_inplace(ctx, query_layer, 1.f / std::sqrt(head_size));
+        ggml_tensor *attn_scores = ggml_mul_mat(ctx, key_layer, query_layer); // [#kvh, (#h/#kvh) * s, kvs]
 
-    if (n_past == 0) {
-        // build attention mask for context input
-        if (num_shared_q_heads > 1) {
-            attn_scores = ggml_reshape_3d(ctx, attn_scores, num_virtual_tokens + n_past + qlen, qlen,
-                                          num_attention_heads); // [#h, s, kvs]
-        }
+        if (n_past == 0) {
+            // build attention mask for context input
+            if (num_shared_q_heads > 1) {
+                attn_scores = ggml_reshape_3d(ctx, attn_scores, num_virtual_tokens + n_past + qlen, qlen,
+                                              num_attention_heads); // [#h, s, kvs]
+            }
 
-        if (attn_mask_type == AttentionMaskType::BIDIRECTIONAL) {
-            // pass
-        } else if (attn_mask_type == AttentionMaskType::CAUSAL) {
-            attn_scores = ggml_diag_mask_inf_inplace(ctx, attn_scores, num_virtual_tokens + n_past);
-        } else {
-            attn_scores = ggml_add_inplace(ctx, attn_scores, attention_mask);
+            if (attention_mask) {
+                attn_scores = ggml_add_inplace(ctx, attn_scores, attention_mask);
+            }
+
+            if (num_shared_q_heads > 1) {
+                attn_scores =
+                    ggml_reshape_3d(ctx, attn_scores, num_virtual_tokens + n_past + qlen, num_shared_q_heads * qlen,
+                                    num_key_value_heads); // [#kvh, (#h/#kvh) * s, kvs]
+            }
         }
 
+        ggml_tensor *attn_probs = ggml_soft_max_inplace(ctx, attn_scores); // [#kvh, (#h/#kvh) * s, kvs]
+
+        context_layer = ggml_mul_mat(ctx, value_layer, attn_probs); // [#kvh, (#h/#kvh) * s, d]
         if (num_shared_q_heads > 1) {
-            attn_scores =
-                ggml_reshape_3d(ctx, attn_scores, num_virtual_tokens + n_past + qlen, num_shared_q_heads * qlen,
-                                num_key_value_heads); // [#kvh, (#h/#kvh) * s, kvs]
+            context_layer = ggml_reshape_3d(ctx, context_layer, head_size, qlen,
+                                            num_attention_heads); // [#h, s, d]
         }
+        context_layer = ggml_cont(ctx, ggml_permute(ctx, context_layer, 0, 2, 1, 3)); // [s, #h, d]
+    } else {
+        // qkv must be correctly padded
+        key_layer = ggml_cast(ctx, key_layer, GGML_TYPE_F16);                                    // [#kvh, s, d]
+        value_layer = ggml_cast(ctx, ggml_permute(ctx, value_layer, 1, 0, 2, 3), GGML_TYPE_F16); // [#kvh, s, d]
+        context_layer = ggml_flash_attn_ext(ctx, query_layer, key_layer, value_layer, attention_mask,
+                                            1.f / std::sqrt(head_size), 0);
+        ggml_flash_attn_ext_set_prec(context_layer, GGML_PREC_F32);
     }
 
-    ggml_tensor *attn_probs = ggml_soft_max_inplace(ctx, attn_scores); // [#kvh, (#h/#kvh) * s, kvs]
-
-    ggml_tensor *context_layer = ggml_mul_mat(ctx, value_layer, attn_probs); // [#kvh, (#h/#kvh) * s, d]
-    if (num_shared_q_heads > 1) {
-        context_layer = ggml_reshape_3d(ctx, context_layer, head_size, qlen,
-                                        num_attention_heads); // [#h, s, d]
-    }
-    context_layer = ggml_cont(ctx, ggml_permute(ctx, context_layer, 0, 2, 1, 3)); // [s, #h, d]
-    context_layer = ggml_reshape_2d(ctx, context_layer, hidden_size, qlen);       // [s, #h * d]
+    context_layer = ggml_reshape_2d(ctx, context_layer, hidden_size, qlen); // [s, #h * d]
 
     ggml_tensor *attn_output = dense.forward(mctx, context_layer);
     return attn_output;
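Note on the restructuring above: inside `if (k_cache && v_cache)` the original eager attention is kept as-is (scale Q, add the mask to the raw scores, softmax over key positions, then a weighted sum of V), while the new `else` branch delegates the whole computation to `ggml_flash_attn_ext` with F16 K/V and requests F32 precision for the fused op. As a reference for what the eager path computes, here is a minimal standalone sketch in plain C++; it is not part of the commit, and the name `naive_attention` and the nested-vector layout are illustrative only.

// Toy single-head attention illustrating the eager path (scale, mask add,
// softmax, weighted sum). Hypothetical helper, not chatglm.cpp code.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// q: [qlen][d], k: [kvlen][d], v: [kvlen][d], mask: [qlen][kvlen] (0 or -inf)
// returns context: [qlen][d]
std::vector<std::vector<float>> naive_attention(const std::vector<std::vector<float>> &q,
                                                const std::vector<std::vector<float>> &k,
                                                const std::vector<std::vector<float>> &v,
                                                const std::vector<std::vector<float>> &mask) {
    const size_t qlen = q.size(), kvlen = k.size(), d = q[0].size();
    const float scale = 1.f / std::sqrt((float)d);
    std::vector<std::vector<float>> out(qlen, std::vector<float>(d, 0.f));
    for (size_t i = 0; i < qlen; i++) {
        // scores[j] = scale * dot(q_i, k_j) + mask[i][j]
        std::vector<float> scores(kvlen);
        float max_score = -INFINITY;
        for (size_t j = 0; j < kvlen; j++) {
            float dot = 0.f;
            for (size_t t = 0; t < d; t++) dot += q[i][t] * k[j][t];
            scores[j] = scale * dot + mask[i][j];
            max_score = std::max(max_score, scores[j]);
        }
        // numerically stable softmax over key positions
        float denom = 0.f;
        for (size_t j = 0; j < kvlen; j++) {
            scores[j] = std::exp(scores[j] - max_score);
            denom += scores[j];
        }
        // weighted sum of values
        for (size_t j = 0; j < kvlen; j++)
            for (size_t t = 0; t < d; t++) out[i][t] += (scores[j] / denom) * v[j][t];
    }
    return out;
}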
@@ -1341,6 +1343,19 @@ void ChatGLM2Model::set_graph_inputs(ggml_cgraph *gf, const std::vector<int> &in
     std::vector<int> position_ids_buffer(position_ids->ne[0]);
     std::iota(position_ids_buffer.begin(), position_ids_buffer.end(), n_past);
     ggml_backend_tensor_set(position_ids, position_ids_buffer.data(), 0, position_ids_buffer.size() * sizeof(int));
+
+    ggml_tensor *attention_mask = ggml_graph_get_tensor(gf, "attention_mask");
+    if (attention_mask) {
+        const int kvlen = attention_mask->ne[0];
+        const int qlen = attention_mask->ne[1];
+        std::vector<float> mask_buf(qlen * kvlen);
+        for (int i = 0; i < qlen; i++) {
+            for (int j = 0; j < kvlen; j++) {
+                mask_buf[i * kvlen + j] = (i < j + qlen - kvlen) ? -INFINITY : 0.f;
+            }
+        }
+        ggml_backend_tensor_set(attention_mask, mask_buf.data(), 0, ggml_nbytes(attention_mask));
+    }
 }
 
 StateDict ChatGLM2ForCausalLM::state_dict() const {
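The mask fill above encodes causality over the concatenated cache-plus-current sequence: query row i of the current chunk (length qlen) may attend to key column j only while j <= i + (kvlen - qlen); every later key gets -INFINITY before the softmax. The same fill appears again in ChatGLM4VModel::set_graph_inputs further down. A standalone toy (not from the repository) that reproduces the formula for an assumed qlen = 3 and kvlen = 5:

// Prints the causal mask produced by the fill in the diff for qlen=3, kvlen=5
// (i.e. 2 cached tokens). Standalone illustration only.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int qlen = 3, kvlen = 5;
    std::vector<float> mask_buf(qlen * kvlen);
    for (int i = 0; i < qlen; i++) {
        for (int j = 0; j < kvlen; j++) {
            mask_buf[i * kvlen + j] = (i < j + qlen - kvlen) ? -INFINITY : 0.f;
        }
    }
    for (int i = 0; i < qlen; i++) {
        for (int j = 0; j < kvlen; j++) {
            std::printf("%6.0f", mask_buf[i * kvlen + j]);
        }
        std::printf("\n");
    }
    // output:
    //      0     0     0  -inf  -inf
    //      0     0     0     0  -inf
    //      0     0     0     0     0
    return 0;
}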
@@ -1827,14 +1842,14 @@ EVA2CLIPTransformer::EVA2CLIPTransformer(ModelContext *mctx, const VisionModelCo
     for (int layer_id = 0; layer_id < config.num_hidden_layers; layer_id++) {
         layers.emplace_back(mctx, config.dtype, config.hidden_size, config.num_attention_heads,
                             config.num_attention_heads, config.intermediate_size, config.num_positions, config.norm_eps,
-                            config.hidden_act, true, true, false, RopeType::DISABLED, -1,
-                            AttentionMaskType::BIDIRECTIONAL, 0, false);
+                            config.hidden_act, true, true, false, RopeType::DISABLED, -1, 0, false);
     }
 }
 
-ggml_tensor *EVA2CLIPTransformer::forward(ModelContext *mctx, ggml_tensor *hidden_states) const {
+ggml_tensor *EVA2CLIPTransformer::forward(ModelContext *mctx, ggml_tensor *hidden_states,
+                                          ggml_tensor *attention_mask) const {
     for (const auto &layer : layers) {
-        hidden_states = layer.forward(mctx, hidden_states, nullptr, nullptr, 0);
+        hidden_states = layer.forward(mctx, hidden_states, attention_mask, nullptr, 0);
     }
     return hidden_states;
 }
@@ -1843,17 +1858,29 @@ ggml_tensor *EVA2CLIPModel::forward(ModelContext *mctx, ggml_tensor *input) cons
     ggml_context *ctx = mctx->ctx_b.get();
 
     ggml_tensor *hidden_states = patch_embedding.forward(mctx, input);
-    hidden_states = transformer.forward(mctx, hidden_states); // [s, hd]
 
-    const int grid_size = std::round(std::sqrt(hidden_states->ne[1] - 1));
+    // padding for flash attn
+    const int pad_to_multiple_of = ggml_cpu_has_cuda() ? 256 : GGML_KQ_MASK_PAD;
+    const int pad_size = GGML_PAD(hidden_states->ne[1], pad_to_multiple_of) - hidden_states->ne[1];
+    if (pad_size) {
+        hidden_states = ggml_pad(ctx, hidden_states, 0, pad_size, 0, 0);
+    }
+
+    ggml_tensor *encoder_attention_mask =
+        ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_states->ne[1], hidden_states->ne[1]);
+    ggml_set_input(encoder_attention_mask);
+    ggml_set_name(encoder_attention_mask, "encoder_attention_mask");
+
+    encoder_attention_mask = ggml_cast(ctx, encoder_attention_mask, GGML_TYPE_F16);
+    hidden_states = transformer.forward(mctx, hidden_states, encoder_attention_mask); // [s, hd]
+
+    const int grid_size = std::round(std::sqrt(hidden_states->ne[1] - pad_size - 1));
     hidden_states = ggml_view_3d(ctx, hidden_states, hidden_states->ne[0], grid_size, grid_size, hidden_states->nb[1],
                                  grid_size * hidden_states->nb[1], hidden_states->nb[1]); // [g, g, hd]
-    // TODO: must use this cont?
-    hidden_states = ggml_cont(ctx, ggml_permute(ctx, hidden_states, 2, 0, 1, 3)); // [hd, g, g]
-    hidden_states = conv.forward(mctx, hidden_states);                            // [hd, g/2, g/2]
+    hidden_states = ggml_cont(ctx, ggml_permute(ctx, hidden_states, 2, 0, 1, 3)); // [hd, g, g]
+    hidden_states = conv.forward(mctx, hidden_states);                            // [hd, g/2, g/2]
     hidden_states = ggml_reshape_2d(ctx, hidden_states, hidden_states->ne[0] * hidden_states->ne[1],
-                                    hidden_states->ne[2]); // [hd, s]
-    // TODO: this cont?
+                                    hidden_states->ne[2]);                        // [hd, s]
     hidden_states = ggml_cont(ctx, ggml_permute(ctx, hidden_states, 1, 0, 2, 3)); // [s, hd]
 
     hidden_states = linear_proj.forward(mctx, hidden_states);
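The new padding step rounds the vision sequence length up so the flash-attention mask is aligned: GGML_PAD(x, n) rounds x up to a multiple of n (256 on the CUDA branch, GGML_KQ_MASK_PAD otherwise), and pad_size is subtracted again before grid_size is recovered from the unpadded token count. A standalone sketch of that round-up arithmetic, using made-up token counts rather than the real model configuration:

// Round-up arithmetic used by the padding step above, shown standalone.
// The token count and grid size here are hypothetical.
#include <cstdio>

static int round_up(int x, int n) { return (x + n - 1) & ~(n - 1); } // same idea as GGML_PAD, n a power of two

int main() {
    const int num_tokens = 1 + 40 * 40; // e.g. 1 CLS token + a 40x40 patch grid = 1601
    const int multiple = 256;           // CUDA branch in the diff
    const int padded = round_up(num_tokens, multiple);
    std::printf("tokens=%d padded=%d pad_size=%d\n", num_tokens, padded, padded - num_tokens);
    // tokens=1601 padded=1792 pad_size=191
    return 0;
}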
@@ -1967,6 +1994,38 @@ void ChatGLM4VModel::set_graph_inputs(ggml_cgraph *gf, const std::vector<int> &i
         // copy to tensor
         ggml_backend_tensor_set(image_tensor, pixels_f32.data(), 0, ggml_nbytes(image_tensor));
     }
+
+    // attention_mask
+    ggml_tensor *attention_mask = ggml_graph_get_tensor(gf, "attention_mask");
+    if (attention_mask) {
+        const int kvlen = attention_mask->ne[0];
+        const int qlen = attention_mask->ne[1];
+        std::vector<float> mask_buf(qlen * kvlen);
+        for (int i = 0; i < qlen; i++) {
+            for (int j = 0; j < kvlen; j++) {
+                mask_buf[i * kvlen + j] = (i < j + qlen - kvlen) ? -INFINITY : 0.f;
+            }
+        }
+        ggml_backend_tensor_set(attention_mask, mask_buf.data(), 0, ggml_nbytes(attention_mask));
+    }
+
+    // encoder_attention_mask
+    ggml_tensor *encoder_attention_mask = ggml_graph_get_tensor(gf, "encoder_attention_mask");
+    if (encoder_attention_mask) {
+        const int valid_tokens = vision.patch_embedding.num_positions();
+        const int M = encoder_attention_mask->ne[1];
+        const int N = encoder_attention_mask->ne[0];
+        std::vector<float> encoder_mask_f32(M * N);
+        CHATGLM_CHECK((size_t)ggml_nelements(encoder_attention_mask) == encoder_mask_f32.size());
+        for (int i = 0; i < M; i++) {
+            for (int j = 0; j < N; j++) {
+                encoder_mask_f32[i * N + j] =
+                    (i < valid_tokens && j < valid_tokens) ? 0.f : -65504.f; // -INFINITY causes nan/inf logits
+            }
+        }
+        ggml_backend_tensor_set(encoder_attention_mask, encoder_mask_f32.data(), 0,
+                                ggml_nbytes(encoder_attention_mask));
+    }
 }
 
 int ChatGLM4VForCausalLM::count_tokens(const std::vector<int> &input_ids, const std::optional<Image> &image) const {
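The encoder mask marks everything outside the first valid_tokens positions with -65504.f, the most negative finite fp16 value, rather than -INFINITY: as the in-line comment notes, a row that is entirely minus infinity (a pure padding query) would make the softmax 0/0 and propagate NaN into the logits, whereas a large finite value merely degrades that row to a uniform distribution that is discarded anyway. A standalone sketch (not part of the commit) showing the difference:

// Compares softmax of a fully-masked row using -INFINITY vs -65504.f.
// Illustration only; not chatglm.cpp code.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static void softmax_row(std::vector<float> &row) {
    float max_v = -INFINITY;
    for (float x : row) max_v = std::max(max_v, x);
    float denom = 0.f;
    for (float &x : row) { x = std::exp(x - max_v); denom += x; }
    for (float &x : row) x /= denom;
}

int main() {
    std::vector<float> all_inf(4, -INFINITY);
    std::vector<float> all_min(4, -65504.f);
    softmax_row(all_inf); // exp(-inf - (-inf)) = NaN, denom = NaN -> whole row NaN
    softmax_row(all_min); // exp(0) everywhere -> uniform 0.25
    std::printf("all -inf   -> %f %f %f %f\n", all_inf[0], all_inf[1], all_inf[2], all_inf[3]);
    std::printf("all -65504 -> %f %f %f %f\n", all_min[0], all_min[1], all_min[2], all_min[3]);
    return 0;
}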