1 file changed: +24 -2 lines changed
Original file line number | Diff line number | Diff line change
@@ -122,8 +122,30 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
122
122
tokenizer = json .load (f )
123
123
if self .load_merges :
124
124
merges = tokenizer .get ('model' , {}).get ('merges' )
125
- if isinstance (merges , list ) and merges and isinstance (merges [0 ], str ):
126
- self .merges = merges
125
+ if isinstance (merges , list ) and merges :
126
+ if isinstance (merges [0 ], str ):
127
+ self .merges = merges
128
+ elif isinstance (merges [0 ], list ) and len (merges [0 ]) == 2 and isinstance (merges [0 ][0 ], str ):
129
+ # New format since transformers 4.45 to support spaces in merges
130
+ # ref: https://github.com/ggerganov/llama.cpp/issues/9692
131
+ # TODO: internally store as the new format instead of converting to old
132
+ if any (' ' in s for pair in merges for s in pair ):
133
+ logger .warning (f'Spaces in merges detected, encoding as { chr (ord (" " ) + 256 )!r} ' )
134
+ self .merges = [
135
+ ' ' .join (
136
+ [
137
+ # ensure the spaces are properly encoded
138
+ '' .join (
139
+ chr (ord (c ) + 256 ) if c == ' ' else c
140
+ for c in part
141
+ )
142
+ for part in pair
143
+ ]
144
+ )
145
+ for pair in merges
146
+ ]
147
+ else :
148
+ raise ValueError ("Unknown tokenizer merges format" )
127
149
added_tokens = tokenizer .get ('added_tokens' , {})
128
150
else :
129
151
added_tokens = {}
You can’t perform that action at this time.
0 commit comments