mtmd : add ultravox audio input #13623

Merged: 33 commits, May 22, 2025. The diff below shows changes from all commits.

Commits:
- 4fa0c27: convert ok, load ok (ngxson, May 4, 2025)
- 8b73116: warmup ok (ngxson, May 4, 2025)
- 4ac7940: test (ngxson, May 4, 2025)
- 4282465: still does not work? (ngxson, May 4, 2025)
- 45cdb7f: fix padding (ngxson, May 4, 2025)
- f3605b9: temporary give up (ngxson, May 4, 2025)
- 1804fa2: Merge branch 'master' into xsn/mtmd_ultravox (ngxson, May 18, 2025)
- bc708b4: fix merge conflict (ngxson, May 18, 2025)
- de20afd: build_ultravox() (ngxson, May 18, 2025)
- bbe4940: rm test (ngxson, May 19, 2025)
- 4d44460: Merge branch 'master' into xsn/mtmd_ultravox (ngxson, May 19, 2025)
- 8d7d75a: fix merge conflict (ngxson, May 19, 2025)
- dce799d: add necessary mtmd APIs (ngxson, May 19, 2025)
- f151854: first working version (only 4s of audio) (ngxson, May 19, 2025)
- 9a0dcb6: will this monster compile? (ngxson, May 19, 2025)
- 1a90395: fix compile (ngxson, May 19, 2025)
- 4a8c092: please compile (ngxson, May 19, 2025)
- 6f23ad1: fPIC (ngxson, May 19, 2025)
- cf38b47: fix windows (ngxson, May 19, 2025)
- cf4f5d2: various fixes (ngxson, May 19, 2025)
- 3bbb26b: clean up audio_helpers (ngxson, May 20, 2025)
- 3ce96d7: fix conversion (ngxson, May 20, 2025)
- cf9613f: add some debug stuff (ngxson, May 20, 2025)
- 23d0d7f: long audio input ok (ngxson, May 21, 2025)
- 7033aa1: adapt the api (ngxson, May 21, 2025)
- e7c8a2e: Merge branch 'master' into xsn/mtmd_ultravox (ngxson, May 21, 2025)
- 111c820: add --audio arg (ngxson, May 21, 2025)
- e6416b0: final touch UX (ngxson, May 21, 2025)
- 36a1abb: add miniaudio to readme (ngxson, May 21, 2025)
- 544f4f1: fix typo (ngxson, May 21, 2025)
- 7602ee4: Merge branch 'master' into xsn/mtmd_ultravox (ngxson, May 22, 2025)
- 9afb3af: refactor kv metadata (ngxson, May 22, 2025)
- 107790a: mtmd_default_marker() (ngxson, May 22, 2025)
4 changes: 4 additions & 0 deletions .editorconfig
```diff
@@ -48,3 +48,7 @@ end_of_line = unset
 charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
+
+[tools/mtmd/miniaudio.h]
+trim_trailing_whitespace = unset
+insert_final_newline = unset
```
1 change: 1 addition & 0 deletions README.md
```diff
@@ -580,3 +580,4 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 - [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
 - [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
 - [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
+- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
```
10 changes: 5 additions & 5 deletions common/arg.cpp
```diff
@@ -39,7 +39,7 @@
 using json = nlohmann::ordered_json;
 
 std::initializer_list<enum llama_example> mmproj_examples = {
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_SERVER,
 };
 
@@ -2233,12 +2233,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
-        {"--image"}, "FILE",
-        "path to an image file. use with multimodal models. Specify multiple times for batching",
+        {"--image", "--audio"}, "FILE",
+        "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
         [](common_params & params, const std::string & value) {
            params.image.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD}));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
@@ -2868,7 +2868,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
```
2 changes: 1 addition & 1 deletion common/common.h
```diff
@@ -76,7 +76,7 @@ enum llama_example {
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
```
160 changes: 118 additions & 42 deletions convert_hf_to_gguf.py

Large diffs are not rendered by default.

14 changes: 13 additions & 1 deletion docs/multimodal.md
````diff
@@ -4,7 +4,9 @@ llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools
 - [llama-mtmd-cli](../tools/mtmd/README.md)
 - [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
 
-To enable it, can use use one of the 2 methods below:
+Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.
+
+To enable it, you can use one of the 2 methods below:
 
 - Use `-hf` option with a supported model (see a list of pre-quantized model below)
 - To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
@@ -37,6 +39,8 @@ Replaces the `(tool_name)` with the name of binary you want to use. For example,
 
 NOTE: some models may require large context window, for example: `-c 8192`
 
+**Vision models**:
+
 ```sh
 # Gemma 3
 (tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
@@ -78,3 +82,11 @@ NOTE: some models may require large context window, for example: `-c 8192`
 # Llama 4 Scout
 (tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
 ```
+
+**Audio models**:
+
+```sh
+# Ultravox 0.5
+(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
+(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF
+```
````
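Since `--audio` is wired into the same code path as `--image` (see the `common/arg.cpp` change above), driving the new audio support from a script is just a matter of passing the new flag. A minimal sketch, assuming a built `llama-mtmd-cli` on `PATH`; the audio file name and prompt are placeholders:

```python
import subprocess

# Fetch the Ultravox model + mmproj via -hf, feed one audio file, and ask a
# question about it. --audio is an alias of --image and can be repeated.
subprocess.run([
    "llama-mtmd-cli",
    "-hf", "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF",
    "--audio", "sample.wav",
    "-p", "What is being said in this audio?",
], check=True)
```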
83 changes: 76 additions & 7 deletions gguf-py/gguf/constants.py
```diff
@@ -219,10 +219,13 @@ class Adapter:
         TYPE = "adapter.type"
         LORA_ALPHA = "adapter.lora.alpha"
 
-    class ClipVision:
+    class Clip:
         PROJECTOR_TYPE = "clip.projector_type"
         HAS_VISION_ENCODER = "clip.has_vision_encoder"
+        HAS_AUDIO_ENCODER = "clip.has_audio_encoder"
         HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
+
+    class ClipVision:
         IMAGE_SIZE = "clip.vision.image_size"
         PATCH_SIZE = "clip.vision.patch_size"
         EMBEDDING_LENGTH = "clip.vision.embedding_length"
@@ -243,19 +246,33 @@ class Attention:
         class Projector:
             SCALE_FACTOR = "clip.vision.projector.scale_factor"
 
+    class ClipAudio:
+        NUM_MEL_BINS = "clip.audio.num_mel_bins"
+        EMBEDDING_LENGTH = "clip.audio.embedding_length"
+        FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
+        PROJECTION_DIM = "clip.audio.projection_dim"
+        BLOCK_COUNT = "clip.audio.block_count"
+
+        class Attention:
+            HEAD_COUNT = "clip.audio.attention.head_count"
+            LAYERNORM_EPS = "clip.audio.attention.layer_norm_epsilon"
+
+        class Projector:
+            STACK_FACTOR = "clip.audio.projector.stack_factor"
+
 #
 # recommended mapping of model tensor names for storage in gguf
 #
 
 
 class GGUFType:
-    MODEL = "model"
-    ADAPTER = "adapter"
-    CLIP_VISION = "clip-vision"
+    MODEL = "model"
+    ADAPTER = "adapter"
+    MMPROJ = "mmproj"  # dummy, unused for now
 
 
 class MODEL_ARCH(IntEnum):
-    CLIP_VISION = auto()  # dummy arch for clip.cpp
+    MMPROJ = auto()  # dummy arch for clip.cpp
     LLAMA = auto()
     LLAMA4 = auto()
     DECI = auto()
@@ -514,10 +531,27 @@ class MODEL_TENSOR(IntEnum):
     V_RESMPL_QUERY = auto()  # minicpmv
     V_TOK_EMBD_IMG_BREAK = auto()  # pixtral
     V_MM_PATCH_MERGER = auto()  # mistral small 3.1
+    # audio (mtmd)
+    A_ENC_EMBD_POS = auto()
+    A_ENC_CONV1D = auto()
+    A_PRE_NORM = auto()
+    A_POST_NORM = auto()
+    A_ENC_ATTN_Q = auto()
+    A_ENC_ATTN_K = auto()
+    A_ENC_ATTN_V = auto()
+    A_ENC_INPUT_NORM = auto()
+    A_ENC_OUTPUT = auto()
+    A_ENC_OUTPUT_NORM = auto()
+    A_ENC_FFN_UP = auto()
+    A_ENC_FFN_GATE = auto()
+    A_ENC_FFN_DOWN = auto()
+    A_MMPROJ = auto()
+    A_MM_NORM_PRE = auto()
+    A_MM_NORM_MID = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
-    MODEL_ARCH.CLIP_VISION: "clip",  # dummy arch for clip.cpp
+    MODEL_ARCH.MMPROJ: "clip",  # dummy arch for clip.cpp
     MODEL_ARCH.LLAMA: "llama",
     MODEL_ARCH.LLAMA4: "llama4",
     MODEL_ARCH.DECI: "deci",
@@ -776,10 +810,27 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query",
     MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break",  # pixtral
     MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger",  # mistral small 3.1
+    # audio (mtmd)
+    MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
+    MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
+    MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
+    MODEL_TENSOR.A_POST_NORM: "a.post_ln",
+    MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q",
+    MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k",
+    MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v",
+    MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
+    MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
+    MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2",
+    MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up",
+    MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
+    MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
+    MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}",
+    MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
+    MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
-    MODEL_ARCH.CLIP_VISION: [
+    MODEL_ARCH.MMPROJ: [
         MODEL_TENSOR.V_MMPROJ,
         MODEL_TENSOR.V_MMPROJ_FC,
         MODEL_TENSOR.V_MMPROJ_MLP,
@@ -819,6 +870,23 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_RESMPL_QUERY,
         MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
         MODEL_TENSOR.V_MM_PATCH_MERGER,
+        # audio
+        MODEL_TENSOR.A_ENC_EMBD_POS,
+        MODEL_TENSOR.A_ENC_CONV1D,
+        MODEL_TENSOR.A_PRE_NORM,
+        MODEL_TENSOR.A_POST_NORM,
+        MODEL_TENSOR.A_ENC_ATTN_Q,
+        MODEL_TENSOR.A_ENC_ATTN_K,
+        MODEL_TENSOR.A_ENC_ATTN_V,
+        MODEL_TENSOR.A_ENC_INPUT_NORM,
+        MODEL_TENSOR.A_ENC_OUTPUT,
+        MODEL_TENSOR.A_ENC_OUTPUT_NORM,
+        MODEL_TENSOR.A_ENC_FFN_UP,
+        MODEL_TENSOR.A_ENC_FFN_GATE,
+        MODEL_TENSOR.A_ENC_FFN_DOWN,
+        MODEL_TENSOR.A_MMPROJ,
+        MODEL_TENSOR.A_MM_NORM_PRE,
+        MODEL_TENSOR.A_MM_NORM_MID,
     ],
     MODEL_ARCH.LLAMA: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -2186,6 +2254,7 @@ class VisionProjectorType:
     LLAMA4 = "llama4"
    QWEN2VL = "qwen2vl_merger"
     QWEN25VL = "qwen2.5vl_merger"
+    ULTRAVOX = "ultravox"
     INTERNVL = "internvl"
```
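The one genuinely new hyperparameter above is `clip.audio.projector.stack_factor`. In the Ultravox projector, groups of consecutive encoder frames are concatenated along the feature axis before the MLP, so the LLM sees a sequence shortened by that factor. A minimal NumPy sketch of the idea; the helper name and zero-padding behavior are illustrative, not taken from this PR:

```python
import numpy as np

def stack_audio_frames(embd: np.ndarray, stack_factor: int) -> np.ndarray:
    """[n_frames, n_embd] -> [ceil(n_frames / stack_factor), n_embd * stack_factor]"""
    n_frames, n_embd = embd.shape
    pad = (-n_frames) % stack_factor     # pad the time axis to a multiple
    if pad:
        embd = np.pad(embd, ((0, pad), (0, 0)))
    return embd.reshape(-1, n_embd * stack_factor)

# e.g. 1500 Whisper-style encoder frames with a stack factor of 8:
frames = np.random.rand(1500, 1280).astype(np.float32)
print(stack_audio_frames(frames, 8).shape)  # (188, 10240)
```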
41 changes: 35 additions & 6 deletions gguf-py/gguf/gguf_writer.py
```diff
@@ -936,12 +936,18 @@ def add_eom_token_id(self, id: int) -> None:
 
     # for vision models
 
+    def add_clip_has_vision_encoder(self, value: bool) -> None:
+        self.add_bool(Keys.Clip.HAS_VISION_ENCODER, value)
+
+    def add_clip_has_audio_encoder(self, value: bool) -> None:
+        self.add_bool(Keys.Clip.HAS_AUDIO_ENCODER, value)
+
+    def add_clip_projector_type(self, value: str) -> None:
+        self.add_string(Keys.Clip.PROJECTOR_TYPE, value)
+
     def add_vision_projection_dim(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value)
 
-    def add_vision_has_vision_encoder(self, value: bool) -> None:
-        self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value)
-
     def add_vision_patch_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.PATCH_SIZE, value)
 
@@ -957,9 +963,6 @@ def add_vision_block_count(self, value: int) -> None:
     def add_vision_head_count(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value)
 
-    def add_vision_projector_type(self, value: str) -> None:
-        self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value)
-
     def add_vision_attention_layernorm_eps(self, value: float) -> None:
         self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value)
 
@@ -987,6 +990,32 @@ def add_vision_projector_scale_factor(self, value: int) -> None:
     def add_vision_n_wa_pattern(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)
 
+    # audio models
+
+    def add_audio_projection_dim(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.PROJECTION_DIM, value)
+
+    def add_audio_embedding_length(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.EMBEDDING_LENGTH, value)
+
+    def add_audio_feed_forward_length(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.FEED_FORWARD_LENGTH, value)
+
+    def add_audio_block_count(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.BLOCK_COUNT, value)
+
+    def add_audio_head_count(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.Attention.HEAD_COUNT, value)
+
+    def add_audio_attention_layernorm_eps(self, value: float) -> None:
+        self.add_float32(Keys.ClipAudio.Attention.LAYERNORM_EPS, value)
+
+    def add_audio_num_mel_bins(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.NUM_MEL_BINS, value)
+
+    def add_audio_stack_factor(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''
         if not skip_pack_prefix:
```
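For illustration, a converter would emit the new audio metadata roughly as in the sketch below. The `add_*` calls are the methods added above; the output file name and the hyperparameter values are hypothetical (picked to resemble a Whisper-style encoder), not taken from this PR:

```python
from gguf import GGUFWriter
from gguf.constants import VisionProjectorType

writer = GGUFWriter("mmproj-ultravox-f16.gguf", arch="clip")
writer.add_clip_projector_type(VisionProjectorType.ULTRAVOX)
writer.add_clip_has_audio_encoder(True)

writer.add_audio_num_mel_bins(128)         # mel bins of the input spectrogram
writer.add_audio_embedding_length(1280)    # encoder hidden size
writer.add_audio_feed_forward_length(5120)
writer.add_audio_block_count(32)
writer.add_audio_head_count(20)
writer.add_audio_attention_layernorm_eps(1e-5)
writer.add_audio_stack_factor(8)           # frames stacked before projection

# ... add_tensor(...) calls for the encoder/projector weights, then:
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```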
62 changes: 62 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
```diff
@@ -1110,6 +1110,68 @@ class TensorNameMap:
         MODEL_TENSOR.V_MM_PATCH_MERGER: (
             "multi_modal_projector.patch_merger.merging_layer",  # mistral small 3.1
         ),
+
+        # audio (mtmd)
+
+        MODEL_TENSOR.A_ENC_EMBD_POS: (
+            "audio_tower.embed_positions",  # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_CONV1D: (
+            "audio_tower.conv{bid}",  # ultravox
+        ),
+
+        MODEL_TENSOR.A_PRE_NORM: (),
+
+        MODEL_TENSOR.A_POST_NORM: (
+            "audio_tower.layer_norm",  # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_ATTN_Q: (
+            "audio_tower.layers.{bid}.self_attn.q_proj",  # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_ATTN_K: (
+            "audio_tower.layers.{bid}.self_attn.k_proj",  # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_ATTN_V: (
+            "audio_tower.layers.{bid}.self_attn.v_proj",  # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_INPUT_NORM: (
+            "audio_tower.layers.{bid}.self_attn_layer_norm",  # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_OUTPUT: (
+            "audio_tower.layers.{bid}.self_attn.out_proj",  # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
+            "audio_tower.layers.{bid}.final_layer_norm",  # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_UP: (
+            "audio_tower.layers.{bid}.fc1",  # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_GATE: (),
+
+        MODEL_TENSOR.A_ENC_FFN_DOWN: (
+            "audio_tower.layers.{bid}.fc2",  # ultravox
+        ),
+
+        MODEL_TENSOR.A_MMPROJ: (
+            "audio.multi_modal_projector.linear_{bid}",  # ultravox
+        ),
+
+        MODEL_TENSOR.A_MM_NORM_PRE: (
+            "audio.multi_modal_projector.ln_pre",  # ultravox
+        ),
+
+        MODEL_TENSOR.A_MM_NORM_MID: (
+            "audio.multi_modal_projector.ln_mid",  # ultravox
+        ),
     }
 
     # architecture-specific block mappings
```
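To make the mapping concrete: at conversion time, `TensorNameMap` turns a Hugging Face checkpoint tensor name into the short GGUF name defined in `constants.py`, and the converter re-appends the `.weight`/`.bias` suffix afterwards. A small sketch, assuming the converter builds the map with the `MMPROJ` arch and an illustrative block count of 32:

```python
import gguf
from gguf.constants import MODEL_ARCH

# name map covering the vision + audio tensors of the dummy mmproj arch
tmap = gguf.get_tensor_name_map(MODEL_ARCH.MMPROJ, 32)

# an Ultravox encoder tensor resolves to its GGUF counterpart;
# get_name() matches after stripping the listed suffixes
name = tmap.get_name("audio_tower.layers.0.self_attn.q_proj.weight",
                     try_suffixes=(".weight", ".bias"))
print(name)  # expected: "a.blk.0.attn_q"
```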