Skip to content

Commit 96a181a

Browse files
ngxson and tdakhran authored
mtmd: refactor audio preprocessing (#17978)
* mtmd: refactor audio preprocessing
* refactor

Co-authored-by: Tarek <tdakhran@users.noreply.github.com>

* wip
* wip (2)
* improve constructor
* fix use_natural_log
* fix padding for short input
* clean up
* remove need_chunking

---------

Co-authored-by: Tarek <tdakhran@users.noreply.github.com>
1 parent 4a4f7e6 commit 96a181a

File tree

5 files changed

+383
-600
lines changed

5 files changed

+383
-600
lines changed

tools/mtmd/clip-model.h

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,13 @@ struct clip_hparams {
6565
int32_t n_mel_bins = 0; // whisper preprocessor
6666
int32_t proj_stack_factor = 0; // ultravox
6767

68+
// audio-to-mel preprocessor params
69+
int32_t audio_chunk_len = -1; // in seconds
70+
int32_t audio_sample_rate = -1;
71+
int32_t audio_n_fft = -1;
72+
int32_t audio_window_len = -1;
73+
int32_t audio_hop_len = -1;
74+
6875
// legacy
6976
bool has_llava_projector = false;
7077
int minicpmv_version = 0;
@@ -278,3 +285,5 @@ struct clip_model {
278285
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
279286
}
280287
};
288+
289+
// Returns a read-only pointer to the clip_hparams belonging to the given clip context.
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx);

tools/mtmd/clip.cpp

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1170,11 +1170,15 @@ struct clip_model_loader {
11701170
model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
11711171
model.proj_type == PROJECTOR_TYPE_GLMA;
11721172
get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
1173-
if (hparams.n_mel_bins != 128) {
1174-
throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
1175-
}
11761173
hparams.ffn_op = FFN_GELU_ERF;
11771174
log_ffn_op = "gelu_erf"; // temporary solution for logging
1175+
1176+
// audio preprocessing params
1177+
hparams.audio_chunk_len = 30; // in seconds
1178+
hparams.audio_sample_rate = 16000;
1179+
hparams.audio_n_fft = 400;
1180+
hparams.audio_window_len = 400;
1181+
hparams.audio_hop_len = 160;
11781182
} break;
11791183
default:
11801184
break;
@@ -1212,6 +1216,11 @@ struct clip_model_loader {
12121216
LOG_INF("\n--- audio hparams ---\n");
12131217
LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
12141218
LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
1219+
LOG_INF("%s: audio_chunk_len: %d\n", __func__, hparams.audio_chunk_len);
1220+
LOG_INF("%s: audio_sample_rate: %d\n", __func__, hparams.audio_sample_rate);
1221+
LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft);
1222+
LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len);
1223+
LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len);
12151224
}
12161225
LOG_INF("\n");
12171226
LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
@@ -3478,3 +3487,7 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
34783487
batch->entries.push_back(clip_image_f32_ptr(audio));
34793488
batch->is_audio = true;
34803489
}
3490+
3491+
// Accessor for the hyperparameters held by a clip context.
// Returns the address of ctx->model.hparams: no copy is made, so the pointer
// refers to storage inside ctx. ctx is dereferenced without a null check.
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
3492+
return &ctx->model.hparams;
3493+
}

0 commit comments

Comments
 (0)