
Commit 745fa0e

piDack and CISC authored

model : add glm-asr support (#17901)

* [model] add glm-asr support
* fix format for ci
* fix convert format for ci
* update glm_asr convert script & use build_ffn for glm_asr clip & use build_stack for padding and review
* check root architecture for convert hf script
* fix conflict with upstream
* fix convert script for glm asr & format clip-impl
* format
* restore hparams text
* improved conversion

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

1 parent 5239229 commit 745fa0e

7 files changed: +160 −13 lines changed

convert_hf_to_gguf.py

Lines changed: 84 additions & 2 deletions
@@ -713,6 +713,9 @@ def load_hparams(dir_model: Path, is_mistral_format: bool):
         if "llm_config" in config:
             # rename for InternVL
             config["text_config"] = config["llm_config"]
+        if "lm_config" in config:
+            # rename for GlmASR
+            config["text_config"] = config["lm_config"]
         if "thinker_config" in config:
             # rename for Qwen2.5-Omni
             config["text_config"] = config["thinker_config"]["text_config"]
@@ -1529,6 +1532,21 @@ def _try_set_pooling_type(self) -> None:
             raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
         self.gguf_writer.add_pooling_type(pooling_type)
 
+    def _set_vocab_glmedge(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def _set_vocab_interns1(self):
         tokens: list[str] = []
         toktypes: list[int] = []
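The `_set_vocab_glmedge` helper reuses the GLM-Edge style GPT-2 vocab export and pins the special tokens to entries from the tokenizer's added vocabulary. A minimal sketch of that lookup, assuming a local GLM-ASR checkpoint directory (the path is a placeholder, not a real model id):

```python
# Sketch only: inspect the added-vocab ids the converter above relies on.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./glm-asr")  # placeholder path
added = tokenizer.get_added_vocab()  # maps added token string -> token id

print(added["<|endoftext|>"])  # used above for eos, unk and bos
print(added["<|user|>"])       # used above for eot
```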
@@ -1658,7 +1676,7 @@ class MmprojModel(ModelBase):
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]
 
-    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]
 
     has_vision_encoder: bool = True  # by default
     has_audio_encoder: bool = False
@@ -1734,7 +1752,8 @@ def get_vision_config(self) -> dict[str, Any] | None:
         return self.global_config.get(config_name)
 
     def get_audio_config(self) -> dict[str, Any] | None:
-        return self.global_config.get("audio_config")
+        mm_config_key = "whisper_config" if "whisper_config" in self.hparams else "audio_config"
+        return self.global_config.get(mm_config_key)
 
     def set_type(self):
         self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
@@ -2372,8 +2391,13 @@ def __init__(self, *args, **kwargs):
         # fix for SmolVLM2, missing `num_attention_heads` in config.json
         if self.hf_arch == "VLlama3ForCausalLM":
             self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
+        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]
 
     def set_vocab(self):
+        if self.origin_hf_arch == "GlmasrModel":
+            return self._set_vocab_glmedge()
+
         if self.is_mistral_format:
             return self._set_vocab_mistral()
 
@@ -2444,6 +2468,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
             "vision_language_adapter.",
             "patch_merger.",
             "pre_mm_projector_norm",
+            "audio_encoder.",
         ]
 
         is_multimodal_tensor = "vision_tower" in name \
@@ -8846,6 +8871,63 @@ def __init__(self, *args, **kwargs):
         raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument")
 
 
+@ModelBase.register("GlmasrModel")
+class GlmASRWhisperEncoderModel(MmprojModel):
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
+            self.hparams["hidden_size"] = self.hparams["d_model"]
+            self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
+            self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
+        self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"])
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".conv" in name and ".weight" in name:
+            return gguf.GGMLQuantizationType.F16
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.startswith("model.") or name.startswith("lm_head."):
+            # skip language model tensors
+            return []
+
+        if name.startswith("audio_encoder.whisper."):
+            name = name.replace("audio_encoder.whisper.", "audio_tower.")
+        if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name:
+            name = name.replace("audio_encoder.", "audio_encoder.adapting.")
+
+        if name.startswith("audio_encoder.audio_bos_eos_token."):
+            return [(self.map_tensor_name("model.vision.boi"), data_torch[0]), (self.map_tensor_name("model.vision.eoi"), data_torch[1])]
+
+        if name.startswith("audio_encoder.adapting."):
+            name = name.replace("audio_encoder.adapting.", "audio.multi_modal_projector.")
+            if ".layer_norm." in name:
+                name = name.replace(".layer_norm.", ".ln_pre.")
+            if ".0." in name:
+                name = name.replace(".0.", ".linear_1.")
+            if ".2." in name:
+                name = name.replace(".2.", ".linear_2.")
+            if ".proj." in name:
+                return []
+
+        if "conv1.bias" in name or "conv2.bias" in name:
+            # transpose conv1 and conv2 bias
+            data_torch = data_torch.unsqueeze(-1)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @ModelBase.register("Qwen2AudioForConditionalGeneration")
 class WhisperEncoderModel(MmprojModel):
     has_vision_encoder = False  # no vision encoder
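For orientation, here is a small standalone sketch of the source-name rewriting that `GlmASRWhisperEncoderModel.modify_tensors` performs before `map_tensor_name` is applied. The example tensor names are illustrative placeholders and are not guaranteed to match the real GLM-ASR checkpoint layout.

```python
# Sketch of the name rewriting above (before gguf name mapping); input names
# below are hypothetical examples, not taken from a real checkpoint.
def rewrite(name: str) -> str | None:
    if name.startswith("model.") or name.startswith("lm_head."):
        return None  # language-model tensors are skipped
    if name.startswith("audio_encoder.whisper."):
        name = name.replace("audio_encoder.whisper.", "audio_tower.")
    if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name:
        name = name.replace("audio_encoder.", "audio_encoder.adapting.")
    if name.startswith("audio_encoder.adapting."):
        name = name.replace("audio_encoder.adapting.", "audio.multi_modal_projector.")
        name = name.replace(".layer_norm.", ".ln_pre.")
        name = name.replace(".0.", ".linear_1.").replace(".2.", ".linear_2.")
        if ".proj." in name:
            return None  # the adapter projection is dropped
    return name

print(rewrite("audio_encoder.whisper.layers.0.self_attn.q_proj.weight"))
# -> audio_tower.layers.0.self_attn.q_proj.weight
print(rewrite("audio_encoder.adapting.0.weight"))
# -> audio.multi_modal_projector.linear_1.weight
print(rewrite("audio_encoder.layer_norm.weight"))
# -> audio.multi_modal_projector.ln_pre.weight
```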

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
@@ -3320,6 +3320,7 @@ class VisionProjectorType:
     ULTRAVOX = "ultravox"
     INTERNVL = "internvl"
     QWEN2A = "qwen2a" # audio
+    GLMA = "glma" # audio
     QWEN25O = "qwen2.5o" # omni
     VOXTRAL = "voxtral"
     LFM2 = "lfm2"

tools/mtmd/clip-graph.h

Lines changed: 4 additions & 0 deletions
@@ -112,4 +112,8 @@ struct clip_graph {
     // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
     // support dynamic resolution
     ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
+
+    // Generic function to stack frames for audio processing
+    // Abstracts out the StackAudioFrames logic used by ultravox
+    ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed);
 };

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 0 deletions
@@ -157,6 +157,7 @@ enum projector_type {
     PROJECTOR_TYPE_INTERNVL,
     PROJECTOR_TYPE_LLAMA4,
     PROJECTOR_TYPE_QWEN2A,
+    PROJECTOR_TYPE_GLMA,
     PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
     PROJECTOR_TYPE_VOXTRAL,
     PROJECTOR_TYPE_LFM2,
@@ -183,6 +184,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_INTERNVL,  "internvl"},
     { PROJECTOR_TYPE_LLAMA4,    "llama4"},
     { PROJECTOR_TYPE_QWEN2A,    "qwen2a"},
+    { PROJECTOR_TYPE_GLMA,      "glma"},
     { PROJECTOR_TYPE_QWEN25O,   "qwen2.5o"},
     { PROJECTOR_TYPE_VOXTRAL,   "voxtral"},
     { PROJECTOR_TYPE_LFM2,      "lfm2"},

tools/mtmd/clip-model.h

Lines changed: 1 addition & 0 deletions
@@ -256,6 +256,7 @@ struct clip_model {
     ggml_tensor * conv1d_2_w = nullptr;
     ggml_tensor * conv1d_2_b = nullptr;
     ggml_tensor * mm_norm_pre_w = nullptr;
+    ggml_tensor * mm_norm_pre_b = nullptr;
     ggml_tensor * mm_norm_mid_w = nullptr;
 
     // cogvlm

tools/mtmd/clip.cpp

Lines changed: 59 additions & 1 deletion
@@ -720,6 +720,32 @@ ggml_tensor * clip_graph::build_rope_2d(
     return cur;
 }
 
+// Generic function to stack frames for audio processing
+// Abstracts out the StackAudioFrames logic used by ultravox
+ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
+    if (stack_factor <= 1) {
+        return cur;
+    }
+
+    int64_t total_elements = ggml_nelements(cur);
+    int64_t stride = n_embed * stack_factor;
+
+    // Calculate padded length
+    int64_t padded_len = GGML_PAD(total_elements, stride);
+    int64_t pad = padded_len - total_elements;
+
+    if (pad > 0) {
+        // Pad the tensor to make it divisible by stride
+        cur = ggml_view_1d(ctx0, cur, total_elements, 0);
+        cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
+    }
+
+    // Reshape to [stride, padded_len / stride]
+    cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
+                       ggml_row_size(cur->type, stride), 0);
+    return cur;
+}
+
 // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
 // support dynamic resolution
 ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
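To make the padding arithmetic in `build_stack` concrete, here is a NumPy sketch of the same stack-and-pad reshape. The frame count, embedding width and stack factor are made-up illustration values; ggml's `[stride, padded_len / stride]` view corresponds to the row-major `(padded_len / stride, stride)` shape below.

```python
# Minimal NumPy sketch of the build_stack padding/reshape; sizes are illustrative.
import numpy as np

n_embd, stack_factor, n_frames = 1280, 4, 750
cur = np.zeros((n_frames, n_embd), dtype=np.float32)   # [frames, embd]

stride = n_embd * stack_factor                  # elements per stacked row
total = cur.size
padded_len = -(-total // stride) * stride       # round up to a multiple, like GGML_PAD
pad = padded_len - total

flat = np.concatenate([cur.reshape(-1), np.zeros(pad, dtype=cur.dtype)])
stacked = flat.reshape(padded_len // stride, stride)  # [frames/stack (rounded up), embd*stack]
print(stacked.shape)  # (188, 5120) with the sizes above
```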
@@ -796,6 +822,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_GLMA:
             {
                 builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
             } break;
@@ -1136,10 +1163,12 @@ struct clip_model_loader {
             } break;
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_GLMA:
         case PROJECTOR_TYPE_VOXTRAL:
             {
                 bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
-                                     model.proj_type == PROJECTOR_TYPE_VOXTRAL;
+                                     model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
+                                     model.proj_type == PROJECTOR_TYPE_GLMA;
                 get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
                 if (hparams.n_mel_bins != 128) {
                     throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
@@ -1510,6 +1539,21 @@ struct clip_model_loader {
                     model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
                     model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
                 } break;
+            case PROJECTOR_TYPE_GLMA:
+                {
+                    model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+                    model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+                    model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                    model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+                    model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+                    model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
+                    model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
+                    model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
+                    model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
+                    model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
+                    model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
+                    model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
+                } break;
             case PROJECTOR_TYPE_LLAMA4:
                 {
                     model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
@@ -2895,6 +2939,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
                     n_patches /= 2;
                 }
             } break;
+        case PROJECTOR_TYPE_GLMA:
+            {
+                n_patches = img->nx;
+                // whisper downscales input token by half after conv1d
+                n_patches /= 2;
+                // reshape by merge_factor
+                n_patches /= ctx->model.hparams.proj_stack_factor;
+                // for BOI and EOI token embeddings
+                n_patches += 2;
+            } break;
         case PROJECTOR_TYPE_COGVLM:
             {
                 n_patches += 2; // for BOI and EOI token embeddings
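As a sanity check on the new `PROJECTOR_TYPE_GLMA` branch, the token count for a hypothetical 30-second clip (3000 mel frames) with an assumed stack factor of 4 works out as follows; the real factor comes from `merge_factor` in the model config.

```python
# Rough token-count sketch mirroring the GLMA branch above; 3000 frames and
# a stack factor of 4 are assumed purely for illustration.
n_frames = 3000
n_patches = n_frames // 2   # whisper conv stack downscales by 2
n_patches //= 4             # proj_stack_factor (merge_factor)
n_patches += 2              # BOI and EOI token embeddings
print(n_patches)            # 377
```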
@@ -3230,6 +3284,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_IDEFICS3:
         case PROJECTOR_TYPE_INTERNVL:
         case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_GLMA:
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_VOXTRAL:
@@ -3340,6 +3395,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_model_proj->ne[1];
         case PROJECTOR_TYPE_QWEN2A:
             return ctx->model.mm_fc_w->ne[1];
+        case PROJECTOR_TYPE_GLMA:
+            return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
             return ctx->model.mm_2_w->ne[1];
@@ -3386,6 +3443,7 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
 bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
     return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
         || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
+        || ctx->proj_type() == PROJECTOR_TYPE_GLMA
         || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
 }
 
tools/mtmd/models/whisper-enc.cpp

Lines changed: 9 additions & 10 deletions
@@ -30,7 +30,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
     GGML_ASSERT(model.layers[0].q_b);
     GGML_ASSERT(model.layers[0].v_b);
     GGML_ASSERT(!model.layers[0].k_b); // no bias for k
-    GGML_ASSERT(model.post_ln_w && model.post_ln_b);
 
     ggml_tensor * pos_embd_selected = ggml_view_2d(
         ctx0, model.position_embeddings,
@@ -49,15 +48,7 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
     if (model.audio_has_stack_frames()) {
         // StackAudioFrames
         // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
-        int64_t stride = n_embd * hparams.proj_stack_factor;
-        int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
-        int64_t pad = padded_len - ggml_nelements(cur);
-        if (pad > 0) {
-            cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
-            cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
-        }
-        cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
-                           ggml_row_size(cur->type, stride), 0);
+        cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
         cb(cur, "after_stacked", -1);
     }
 
@@ -95,6 +86,14 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
             FFN_GELU_ERF,
             -1);
 
+    } else if (proj_type == PROJECTOR_TYPE_GLMA) {
+        cur = ggml_norm(ctx0, cur, hparams.eps);
+        cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
+        cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
+        cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
+        cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0);
+        cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
+        cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
     } else {
         GGML_ABORT("%s: unknown projector type", __func__);
     }
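Read together with `build_stack`, the GLMA projector branch amounts to: pre-norm, stack, a two-layer MLP, then BOI/EOI embeddings concatenated around the sequence. A shape-only PyTorch sketch follows; all dimensions and the GELU activation are assumptions for illustration, not values taken from the model.

```python
# Shape-level sketch of the GLMA projector branch above; dims are illustrative.
import torch
import torch.nn as nn

n_embd, stack, n_ff, n_out, T = 1280, 4, 4096, 4096, 376  # T divisible by stack, so no padding
x = torch.randn(T, n_embd)                  # whisper encoder output [tokens, embd]

x = nn.LayerNorm(n_embd)(x)                 # mm_norm_pre_{w,b}
x = x.reshape(T // stack, n_embd * stack)   # build_stack: merge `stack` frames per row
x = nn.Linear(n_embd * stack, n_ff)(x)      # mm_1_{w,b}
x = torch.nn.functional.gelu(x)             # hparams.ffn_op (activation assumed)
x = nn.Linear(n_ff, n_out)(x)               # mm_2_{w,b}

boi = torch.randn(1, n_out)                 # mm_boi / mm_eoi embeddings
eoi = torch.randn(1, n_out)
x = torch.cat([boi, x, eoi], dim=0)         # prepend BOI, append EOI
print(x.shape)                              # torch.Size([96, 4096])
```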
