Skip to content

Commit 5fa8294

Browse files
compilade authored and arthw committed
convert : handle tokenizer merges format from transformers 4.45 (ggml-org#9696)
1 parent 40aabf4 commit 5fa8294

File tree

1 file changed

+24
-2
lines changed

1 file changed

+24
-2
lines changed

gguf-py/gguf/vocab.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,30 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
122122
tokenizer = json.load(f)
123123
if self.load_merges:
124124
merges = tokenizer.get('model', {}).get('merges')
125-
if isinstance(merges, list) and merges and isinstance(merges[0], str):
126-
self.merges = merges
125+
if isinstance(merges, list) and merges:
126+
if isinstance(merges[0], str):
127+
self.merges = merges
128+
elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str):
129+
# New format since transformers 4.45 to support spaces in merges
130+
# ref: https://github.com/ggerganov/llama.cpp/issues/9692
131+
# TODO: internally store as the new format instead of converting to old
132+
if any(' ' in s for pair in merges for s in pair):
133+
logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}')
134+
self.merges = [
135+
' '.join(
136+
[
137+
# ensure the spaces are properly encoded
138+
''.join(
139+
chr(ord(c) + 256) if c == ' ' else c
140+
for c in part
141+
)
142+
for part in pair
143+
]
144+
)
145+
for pair in merges
146+
]
147+
else:
148+
raise ValueError("Unknown tokenizer merges format")
127149
added_tokens = tokenizer.get('added_tokens', {})
128150
else:
129151
added_tokens = {}

0 commit comments

Comments
 (0)