Skip to content

Commit 985d59f

Browse files
committed
tts : outetts-voc -> wavtokenizer-dec
1 parent f1b5b6b commit 985d59f

File tree

5 files changed

+198
-198
lines changed

convert_hf_to_gguf.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2032,9 +2032,9 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
20322032
yield name, data
20332033

20342034

2035-
@Model.register("OuteTTSVocoder")
2036-
class OuteTTSVocoderModel(Model):
2037-
model_arch = gguf.MODEL_ARCH.OUTETTS_VOC
2035+
@Model.register("WavTokenizerDec")
2036+
class WavTokenizerDecModel(Model):
2037+
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
20382038

20392039
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
20402040
del bid # unused

examples/tts/convert_pt_to_hf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF format
2-
# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the OuteTTSS vocoder
2+
# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the WavTokenizer decoder
33
#
44
# TODO: this script is LLM-generated and probably very inefficient and should be rewritten
55

@@ -144,7 +144,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'):
144144

145145
config = {
146146
"architectures": [
147-
"OuteTTSVocoder"
147+
"WavTokenizerDec"
148148
],
149149
"hidden_size": 1282,
150150
"vocab_size": 4096,

gguf-py/gguf/constants.py

Lines changed: 107 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -209,59 +209,59 @@ class GGUFType:
209209

210210

211211
class MODEL_ARCH(IntEnum):
212-
LLAMA = auto()
213-
FALCON = auto()
214-
BAICHUAN = auto()
215-
GROK = auto()
216-
GPT2 = auto()
217-
GPTJ = auto()
218-
GPTNEOX = auto()
219-
MPT = auto()
220-
STARCODER = auto()
221-
REFACT = auto()
222-
BERT = auto()
223-
NOMIC_BERT = auto()
224-
JINA_BERT_V2 = auto()
225-
BLOOM = auto()
226-
STABLELM = auto()
227-
QWEN = auto()
228-
QWEN2 = auto()
229-
QWEN2MOE = auto()
230-
QWEN2VL = auto()
231-
PHI2 = auto()
232-
PHI3 = auto()
233-
PLAMO = auto()
234-
CODESHELL = auto()
235-
ORION = auto()
236-
INTERNLM2 = auto()
237-
MINICPM = auto()
238-
MINICPM3 = auto()
239-
GEMMA = auto()
240-
GEMMA2 = auto()
241-
STARCODER2 = auto()
242-
RWKV6 = auto()
243-
MAMBA = auto()
244-
XVERSE = auto()
245-
COMMAND_R = auto()
246-
DBRX = auto()
247-
OLMO = auto()
248-
OLMO2 = auto()
249-
OLMOE = auto()
250-
OPENELM = auto()
251-
ARCTIC = auto()
252-
DEEPSEEK = auto()
253-
DEEPSEEK2 = auto()
254-
CHATGLM = auto()
255-
BITNET = auto()
256-
T5 = auto()
257-
T5ENCODER = auto()
258-
JAIS = auto()
259-
NEMOTRON = auto()
260-
EXAONE = auto()
261-
GRANITE = auto()
262-
GRANITE_MOE = auto()
263-
CHAMELEON = auto()
264-
OUTETTS_VOC = auto()
212+
LLAMA = auto()
213+
FALCON = auto()
214+
BAICHUAN = auto()
215+
GROK = auto()
216+
GPT2 = auto()
217+
GPTJ = auto()
218+
GPTNEOX = auto()
219+
MPT = auto()
220+
STARCODER = auto()
221+
REFACT = auto()
222+
BERT = auto()
223+
NOMIC_BERT = auto()
224+
JINA_BERT_V2 = auto()
225+
BLOOM = auto()
226+
STABLELM = auto()
227+
QWEN = auto()
228+
QWEN2 = auto()
229+
QWEN2MOE = auto()
230+
QWEN2VL = auto()
231+
PHI2 = auto()
232+
PHI3 = auto()
233+
PLAMO = auto()
234+
CODESHELL = auto()
235+
ORION = auto()
236+
INTERNLM2 = auto()
237+
MINICPM = auto()
238+
MINICPM3 = auto()
239+
GEMMA = auto()
240+
GEMMA2 = auto()
241+
STARCODER2 = auto()
242+
RWKV6 = auto()
243+
MAMBA = auto()
244+
XVERSE = auto()
245+
COMMAND_R = auto()
246+
DBRX = auto()
247+
OLMO = auto()
248+
OLMO2 = auto()
249+
OLMOE = auto()
250+
OPENELM = auto()
251+
ARCTIC = auto()
252+
DEEPSEEK = auto()
253+
DEEPSEEK2 = auto()
254+
CHATGLM = auto()
255+
BITNET = auto()
256+
T5 = auto()
257+
T5ENCODER = auto()
258+
JAIS = auto()
259+
NEMOTRON = auto()
260+
EXAONE = auto()
261+
GRANITE = auto()
262+
GRANITE_MOE = auto()
263+
CHAMELEON = auto()
264+
WAVTOKENIZER_DEC = auto()
265265

266266

267267
class MODEL_TENSOR(IntEnum):
@@ -390,59 +390,59 @@ class MODEL_TENSOR(IntEnum):
390390

391391

392392
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
393-
MODEL_ARCH.LLAMA: "llama",
394-
MODEL_ARCH.FALCON: "falcon",
395-
MODEL_ARCH.BAICHUAN: "baichuan",
396-
MODEL_ARCH.GROK: "grok",
397-
MODEL_ARCH.GPT2: "gpt2",
398-
MODEL_ARCH.GPTJ: "gptj",
399-
MODEL_ARCH.GPTNEOX: "gptneox",
400-
MODEL_ARCH.MPT: "mpt",
401-
MODEL_ARCH.STARCODER: "starcoder",
402-
MODEL_ARCH.REFACT: "refact",
403-
MODEL_ARCH.BERT: "bert",
404-
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
405-
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
406-
MODEL_ARCH.BLOOM: "bloom",
407-
MODEL_ARCH.STABLELM: "stablelm",
408-
MODEL_ARCH.QWEN: "qwen",
409-
MODEL_ARCH.QWEN2: "qwen2",
410-
MODEL_ARCH.QWEN2MOE: "qwen2moe",
411-
MODEL_ARCH.QWEN2VL: "qwen2vl",
412-
MODEL_ARCH.PHI2: "phi2",
413-
MODEL_ARCH.PHI3: "phi3",
414-
MODEL_ARCH.PLAMO: "plamo",
415-
MODEL_ARCH.CODESHELL: "codeshell",
416-
MODEL_ARCH.ORION: "orion",
417-
MODEL_ARCH.INTERNLM2: "internlm2",
418-
MODEL_ARCH.MINICPM: "minicpm",
419-
MODEL_ARCH.MINICPM3: "minicpm3",
420-
MODEL_ARCH.GEMMA: "gemma",
421-
MODEL_ARCH.GEMMA2: "gemma2",
422-
MODEL_ARCH.STARCODER2: "starcoder2",
423-
MODEL_ARCH.RWKV6: "rwkv6",
424-
MODEL_ARCH.MAMBA: "mamba",
425-
MODEL_ARCH.XVERSE: "xverse",
426-
MODEL_ARCH.COMMAND_R: "command-r",
427-
MODEL_ARCH.DBRX: "dbrx",
428-
MODEL_ARCH.OLMO: "olmo",
429-
MODEL_ARCH.OLMO2: "olmo2",
430-
MODEL_ARCH.OLMOE: "olmoe",
431-
MODEL_ARCH.OPENELM: "openelm",
432-
MODEL_ARCH.ARCTIC: "arctic",
433-
MODEL_ARCH.DEEPSEEK: "deepseek",
434-
MODEL_ARCH.DEEPSEEK2: "deepseek2",
435-
MODEL_ARCH.CHATGLM: "chatglm",
436-
MODEL_ARCH.BITNET: "bitnet",
437-
MODEL_ARCH.T5: "t5",
438-
MODEL_ARCH.T5ENCODER: "t5encoder",
439-
MODEL_ARCH.JAIS: "jais",
440-
MODEL_ARCH.NEMOTRON: "nemotron",
441-
MODEL_ARCH.EXAONE: "exaone",
442-
MODEL_ARCH.GRANITE: "granite",
443-
MODEL_ARCH.GRANITE_MOE: "granitemoe",
444-
MODEL_ARCH.CHAMELEON: "chameleon",
445-
MODEL_ARCH.OUTETTS_VOC: "outetts-voc",
393+
MODEL_ARCH.LLAMA: "llama",
394+
MODEL_ARCH.FALCON: "falcon",
395+
MODEL_ARCH.BAICHUAN: "baichuan",
396+
MODEL_ARCH.GROK: "grok",
397+
MODEL_ARCH.GPT2: "gpt2",
398+
MODEL_ARCH.GPTJ: "gptj",
399+
MODEL_ARCH.GPTNEOX: "gptneox",
400+
MODEL_ARCH.MPT: "mpt",
401+
MODEL_ARCH.STARCODER: "starcoder",
402+
MODEL_ARCH.REFACT: "refact",
403+
MODEL_ARCH.BERT: "bert",
404+
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
405+
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
406+
MODEL_ARCH.BLOOM: "bloom",
407+
MODEL_ARCH.STABLELM: "stablelm",
408+
MODEL_ARCH.QWEN: "qwen",
409+
MODEL_ARCH.QWEN2: "qwen2",
410+
MODEL_ARCH.QWEN2MOE: "qwen2moe",
411+
MODEL_ARCH.QWEN2VL: "qwen2vl",
412+
MODEL_ARCH.PHI2: "phi2",
413+
MODEL_ARCH.PHI3: "phi3",
414+
MODEL_ARCH.PLAMO: "plamo",
415+
MODEL_ARCH.CODESHELL: "codeshell",
416+
MODEL_ARCH.ORION: "orion",
417+
MODEL_ARCH.INTERNLM2: "internlm2",
418+
MODEL_ARCH.MINICPM: "minicpm",
419+
MODEL_ARCH.MINICPM3: "minicpm3",
420+
MODEL_ARCH.GEMMA: "gemma",
421+
MODEL_ARCH.GEMMA2: "gemma2",
422+
MODEL_ARCH.STARCODER2: "starcoder2",
423+
MODEL_ARCH.RWKV6: "rwkv6",
424+
MODEL_ARCH.MAMBA: "mamba",
425+
MODEL_ARCH.XVERSE: "xverse",
426+
MODEL_ARCH.COMMAND_R: "command-r",
427+
MODEL_ARCH.DBRX: "dbrx",
428+
MODEL_ARCH.OLMO: "olmo",
429+
MODEL_ARCH.OLMO2: "olmo2",
430+
MODEL_ARCH.OLMOE: "olmoe",
431+
MODEL_ARCH.OPENELM: "openelm",
432+
MODEL_ARCH.ARCTIC: "arctic",
433+
MODEL_ARCH.DEEPSEEK: "deepseek",
434+
MODEL_ARCH.DEEPSEEK2: "deepseek2",
435+
MODEL_ARCH.CHATGLM: "chatglm",
436+
MODEL_ARCH.BITNET: "bitnet",
437+
MODEL_ARCH.T5: "t5",
438+
MODEL_ARCH.T5ENCODER: "t5encoder",
439+
MODEL_ARCH.JAIS: "jais",
440+
MODEL_ARCH.NEMOTRON: "nemotron",
441+
MODEL_ARCH.EXAONE: "exaone",
442+
MODEL_ARCH.GRANITE: "granite",
443+
MODEL_ARCH.GRANITE_MOE: "granitemoe",
444+
MODEL_ARCH.CHAMELEON: "chameleon",
445+
MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
446446
}
447447

448448
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -1406,7 +1406,7 @@ class MODEL_TENSOR(IntEnum):
14061406
MODEL_TENSOR.FFN_DOWN,
14071407
MODEL_TENSOR.FFN_UP,
14081408
],
1409-
MODEL_ARCH.OUTETTS_VOC: [
1409+
MODEL_ARCH.WAVTOKENIZER_DEC: [
14101410
MODEL_TENSOR.TOKEN_EMBD,
14111411
MODEL_TENSOR.TOKEN_EMBD_NORM,
14121412
MODEL_TENSOR.CONV1D,

gguf-py/gguf/tensor_mapping.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class TensorNameMap:
4242
"emb_ln", # nomic-bert
4343
"transformer.norm", # openelm
4444
"rwkv.blocks.0.pre_ln", # rwkv
45-
"backbone.norm", # outetts
45+
"backbone.norm", # wavtokenizer
4646
),
4747

4848
# Position embeddings
@@ -61,7 +61,7 @@ class TensorNameMap:
6161
"lm_head.linear", # phi2
6262
"output_layer", # chatglm
6363
"head", # rwkv
64-
"head.out", # outetts
64+
"head.out", # wavtokenizer
6565
),
6666

6767
# Output norm
@@ -82,7 +82,7 @@ class TensorNameMap:
8282
"transformer.norm", # openelm
8383
"model.norm", # nemotron
8484
"rwkv.ln_out", # rwkv
85-
"backbone.final_layer_norm", # outetts
85+
"backbone.final_layer_norm", # wavtokenizer
8686
),
8787

8888
# Rope frequencies
@@ -705,63 +705,63 @@ class TensorNameMap:
705705
#############################################################################
706706

707707
MODEL_TENSOR.CONV_NEXT_DW: (
708-
"backbone.convnext.{bid}.dwconv", # outetts
708+
"backbone.convnext.{bid}.dwconv", # wavtokenizer
709709
),
710710

711711
MODEL_TENSOR.CONV_NEXT_NORM: (
712-
"backbone.convnext.{bid}.norm", # outetts
712+
"backbone.convnext.{bid}.norm", # wavtokenizer
713713
),
714714

715715
MODEL_TENSOR.CONV_NEXT_PW1: (
716-
"backbone.convnext.{bid}.pwconv1", # outetts
716+
"backbone.convnext.{bid}.pwconv1", # wavtokenizer
717717
),
718718

719719
MODEL_TENSOR.CONV_NEXT_PW2: (
720-
"backbone.convnext.{bid}.pwconv2", # outetts
720+
"backbone.convnext.{bid}.pwconv2", # wavtokenizer
721721
),
722722

723723
MODEL_TENSOR.CONV_NEXT_GAMMA: (
724-
"backbone.convnext.{bid}.gamma", # outetts
724+
"backbone.convnext.{bid}.gamma", # wavtokenizer
725725
),
726726

727727
MODEL_TENSOR.POS_NET_CONV1: (
728-
"backbone.pos_net.{bid}.conv1", # outetts
728+
"backbone.pos_net.{bid}.conv1", # wavtokenizer
729729
),
730730

731731
MODEL_TENSOR.POS_NET_CONV2: (
732-
"backbone.pos_net.{bid}.conv2", # outetts
732+
"backbone.pos_net.{bid}.conv2", # wavtokenizer
733733
),
734734

735735
MODEL_TENSOR.POS_NET_NORM: (
736-
"backbone.pos_net.{bid}.norm", # outetts
736+
"backbone.pos_net.{bid}.norm", # wavtokenizer
737737
),
738738

739739
MODEL_TENSOR.POS_NET_NORM1: (
740-
"backbone.pos_net.{bid}.norm1", # outetts
740+
"backbone.pos_net.{bid}.norm1", # wavtokenizer
741741
),
742742

743743
MODEL_TENSOR.POS_NET_NORM2: (
744-
"backbone.pos_net.{bid}.norm2", # outetts
744+
"backbone.pos_net.{bid}.norm2", # wavtokenizer
745745
),
746746

747747
MODEL_TENSOR.POS_NET_ATTN_NORM: (
748-
"backbone.pos_net.{bid}.norm", # outetts
748+
"backbone.pos_net.{bid}.norm", # wavtokenizer
749749
),
750750

751751
MODEL_TENSOR.POS_NET_ATTN_Q: (
752-
"backbone.pos_net.{bid}.q", # outetts
752+
"backbone.pos_net.{bid}.q", # wavtokenizer
753753
),
754754

755755
MODEL_TENSOR.POS_NET_ATTN_K: (
756-
"backbone.pos_net.{bid}.k", # outetts
756+
"backbone.pos_net.{bid}.k", # wavtokenizer
757757
),
758758

759759
MODEL_TENSOR.POS_NET_ATTN_V: (
760-
"backbone.pos_net.{bid}.v", # outetts
760+
"backbone.pos_net.{bid}.v", # wavtokenizer
761761
),
762762

763763
MODEL_TENSOR.POS_NET_ATTN_OUT: (
764-
"backbone.pos_net.{bid}.proj_out", # outetts
764+
"backbone.pos_net.{bid}.proj_out", # wavtokenizer
765765
),
766766
}
767767

0 commit comments

Comments
 (0)