1
+ /*
2
+ * patch for luajit by CppCXY
3
+ */
4
+
5
+ #include <lua.h>
6
+ #include <lauxlib.h>
7
+ #include <string.h>
8
+
9
+ // Returns the number of characters in the UTF-8 string `s`
10
+ // that start between byte position `i` and `j` (both included).
11
+ // The default for `i` and `j` is to consider all characters in the string.
12
+ // For negative indices, it starts counting from the end of the string.
13
+ // If `lax` is true, the function returns the number of characters in the string,
14
+ // even if some of them are invalid.
15
+ // Invalid characters are always counted as one character.
16
+ // signature: utf8.len(s [, i [, j [, lax]]])
17
+ // signature (s, [i], [j], [lax])
18
+ int luajit_utf8_len (lua_State * L )
19
+ {
20
+ size_t len ;
21
+ const char * s = luaL_checklstring (L , 1 , & len );
22
+ lua_Integer i = luaL_optinteger (L , 2 , 1 );
23
+ lua_Integer j = luaL_optinteger (L , 3 , len );
24
+ int lax = lua_toboolean (L , 4 );
25
+
26
+ // Adjust negative indices
27
+ if (i < 0 )
28
+ i += len + 1 ;
29
+ if (j < 0 )
30
+ j += len + 1 ;
31
+
32
+ // Clamp indices to the string boundaries
33
+ if (i < 1 )
34
+ i = 1 ;
35
+ if (j > (lua_Integer )len )
36
+ j = len ;
37
+ if (i > j )
38
+ {
39
+ lua_pushinteger (L , 0 );
40
+ return 1 ;
41
+ }
42
+
43
+ size_t start = i - 1 ;
44
+ size_t end = j - 1 ;
45
+ size_t count = 0 ;
46
+
47
+ // Traverse the string to count characters
48
+ for (size_t p = start ; p <= end ;)
49
+ {
50
+ if ((s [p ] & 0xC0 ) != 0x80 )
51
+ {
52
+ count ++ ;
53
+ }
54
+ if (!lax && (s [p ] & 0xC0 ) == 0x80 )
55
+ {
56
+ // Invalid UTF-8 sequence
57
+ p ++ ;
58
+ continue ;
59
+ }
60
+ p ++ ;
61
+ }
62
+
63
+ lua_pushinteger (L , count );
64
+ return 1 ;
65
+ }
66
+
67
+ // signature (s, n, [i])
68
+ int luajit_utf8_offset (lua_State * L )
69
+ {
70
+ // Get the string and the integer n from the Lua stack
71
+ size_t len ;
72
+ const char * s = luaL_checklstring (L , 1 , & len );
73
+ lua_Integer n = luaL_checkinteger (L , 2 );
74
+ lua_Integer i = luaL_optinteger (L , 3 , 1 );
75
+
76
+ // Adjust the starting index to be 0-based
77
+ if (i < 1 )
78
+ i = 1 ;
79
+ size_t p = i - 1 ;
80
+
81
+ // Traverse the string to find the byte offset of the nth UTF-8 character
82
+ lua_Integer count = 0 ;
83
+ while (p < len )
84
+ {
85
+ // Check if the current byte is the start of a UTF-8 character
86
+ if ((s [p ] & 0xC0 ) != 0x80 )
87
+ {
88
+ count ++ ;
89
+ if (count == n )
90
+ {
91
+ lua_pushinteger (L , p + 1 ); // Lua uses 1-based indexing
92
+ return 1 ;
93
+ }
94
+ }
95
+ p ++ ;
96
+ }
97
+
98
+ // If we reach here, it means the nth character was not found
99
+ lua_pushnil (L );
100
+ }
101
+
102
+ // Receives zero or more integers,
103
+ // converts each one to its corresponding UTF-8 byte sequence and returns a string with the concatenation of
104
+ // all these sequences.
105
+ int luajit_utf8_char (lua_State * L )
106
+ {
107
+ int n = lua_gettop (L ); // Number of arguments
108
+ luaL_Buffer b ;
109
+ luaL_buffinit (L , & b );
110
+
111
+ for (int i = 1 ; i <= n ; i ++ )
112
+ {
113
+ lua_Integer code = luaL_checkinteger (L , i );
114
+ if (code < 0x80 )
115
+ {
116
+ // 1-byte sequence
117
+ luaL_addchar (& b , (char )code );
118
+ }
119
+ else if (code < 0x800 )
120
+ {
121
+ // 2-byte sequence
122
+ luaL_addchar (& b , (char )(0xC0 | (code >> 6 )));
123
+ luaL_addchar (& b , (char )(0x80 | (code & 0x3F )));
124
+ }
125
+ else if (code < 0x10000 )
126
+ {
127
+ // 3-byte sequence
128
+ luaL_addchar (& b , (char )(0xE0 | (code >> 12 )));
129
+ luaL_addchar (& b , (char )(0x80 | ((code >> 6 ) & 0x3F )));
130
+ luaL_addchar (& b , (char )(0x80 | (code & 0x3F )));
131
+ }
132
+ else if (code < 0x110000 )
133
+ {
134
+ // 4-byte sequence
135
+ luaL_addchar (& b , (char )(0xF0 | (code >> 18 )));
136
+ luaL_addchar (& b , (char )(0x80 | ((code >> 12 ) & 0x3F )));
137
+ luaL_addchar (& b , (char )(0x80 | ((code >> 6 ) & 0x3F )));
138
+ luaL_addchar (& b , (char )(0x80 | (code & 0x3F )));
139
+ }
140
+ else
141
+ {
142
+ return luaL_error (L , "invalid UTF-8 code point" );
143
+ }
144
+ }
145
+
146
+ luaL_pushresult (& b );
147
+ return 1 ;
148
+ }
149
+
150
// Decodes the UTF-8 sequence starting at `s`, returns its code point and
// stores the number of bytes it occupies in `*len`.
//
// Invalid input is reported as a single byte: when the lead byte cannot
// start a sequence (0x80-0xBF or 0xF8-0xFF), or an expected continuation
// byte (10xxxxxx) is missing, `*len` is set to 1 and the raw lead byte
// value (>= 0x80) is returned. Callers recognise this as "length 1 with the
// high bit set".
//
// Continuation bytes are inspected one at a time and decoding stops at the
// first byte that is not a continuation byte, so on a NUL-terminated buffer
// nothing past the terminator is read. Overlong encodings and surrogate
// code points are not rejected.
static int luajit_utf8_decode(const char *s, int *len)
{
    const unsigned char *bytes = (const unsigned char *)s;
    const unsigned char lead = bytes[0];
    int extra; // number of continuation bytes the lead byte announces
    int code;  // payload bits accumulated so far

    if (lead < 0x80)
    {
        // Plain ASCII
        *len = 1;
        return lead;
    }
    else if (lead >= 0xC0 && lead < 0xE0)
    {
        extra = 1;
        code = lead & 0x1F;
    }
    else if (lead >= 0xE0 && lead < 0xF0)
    {
        extra = 2;
        code = lead & 0x0F;
    }
    else if (lead >= 0xF0 && lead < 0xF8)
    {
        extra = 3;
        code = lead & 0x07;
    }
    else
    {
        // Stray continuation byte or a byte that never appears in UTF-8
        *len = 1;
        return lead;
    }

    for (int k = 1; k <= extra; k++)
    {
        if ((bytes[k] & 0xC0) != 0x80)
        {
            // Truncated sequence: report only the lead byte as invalid
            *len = 1;
            return lead;
        }
        code = (code << 6) | (bytes[k] & 0x3F);
    }

    *len = extra + 1;
    return code;
}
175
+
176
+ // Returns the codepoints (as integers) from all characters
177
+ // in `s` that start between byte position `i` and `j` (both included).
178
+ // signature (s [i], [j], [lax]) -> multiple integer values
179
+ int luajit_utf8_codepoint (lua_State * L )
180
+ {
181
+ size_t len ;
182
+ const char * s = luaL_checklstring (L , 1 , & len );
183
+ lua_Integer i = luaL_optinteger (L , 2 , 1 );
184
+ lua_Integer j = luaL_optinteger (L , 3 , len );
185
+ int lax = lua_toboolean (L , 4 );
186
+
187
+ // Adjust negative indices
188
+ if (i < 0 )
189
+ i += len + 1 ;
190
+ if (j < 0 )
191
+ j += len + 1 ;
192
+
193
+ // Clamp indices to the string boundaries
194
+ if (i < 1 )
195
+ i = 1 ;
196
+ if (j > (lua_Integer )len )
197
+ j = len ;
198
+ if (i > j )
199
+ {
200
+ lua_pushnil (L );
201
+ return 1 ;
202
+ }
203
+
204
+ size_t pos = i - 1 ;
205
+ int char_len ;
206
+ int codepoint = luajit_utf8_decode (s + pos , & char_len );
207
+
208
+ if (!lax && (char_len == 1 && (s [pos ] & 0x80 ) != 0 ))
209
+ {
210
+ lua_pushnil (L );
211
+ return 1 ;
212
+ }
213
+
214
+ lua_pushinteger (L , codepoint ); // Push the first code point
215
+
216
+ if (i == j )
217
+ {
218
+ return 1 ; // Return the single code point
219
+ }
220
+
221
+ int count = 1 ;
222
+ pos += char_len ;
223
+ while (pos < (size_t )j )
224
+ {
225
+ codepoint = luajit_utf8_decode (s + pos , & char_len );
226
+ if (!lax && (char_len == 1 && (s [pos ] & 0x80 ) != 0 ))
227
+ {
228
+ lua_pushnil (L );
229
+ return 1 ;
230
+ }
231
+ lua_pushinteger (L , codepoint ); // Push the code point
232
+ count ++ ;
233
+ pos += char_len ;
234
+ }
235
+
236
+ return count ; // Return the number of code points
237
+ }
238
+
239
+ // Iterator function
240
+ static int utf8_codes_iter (lua_State * L , int lax )
241
+ {
242
+ size_t len ;
243
+ const char * s = luaL_checklstring (L , 1 , & len );
244
+ int pos = luaL_checkinteger (L , 2 );
245
+
246
+ if (pos >= (int )len )
247
+ {
248
+ return 0 ; // End of iteration
249
+ }
250
+
251
+ int char_len ;
252
+ int codepoint = luajit_utf8_decode (s + pos , & char_len );
253
+
254
+ if (!lax && (char_len == 1 && (s [pos ] & 0x80 ) != 0 ))
255
+ {
256
+ return luaL_error (L , "invalid UTF-8 byte sequence" );
257
+ }
258
+
259
+ lua_pushinteger (L , pos + 1 ); // Next position
260
+ lua_pushinteger (L , codepoint ); // Code point
261
+ return 2 ;
262
+ }
263
+
264
+ static int iter_codes_strict (lua_State * L )
265
+ {
266
+ return utf8_codes_iter (L , 0 );
267
+ }
268
+
269
+ static int iter_codes_lax (lua_State * L )
270
+ {
271
+ return utf8_codes_iter (L , 1 );
272
+ }
273
+
274
+ // Returns values so that the construction
275
+ // ```lua
276
+ // for p, c in utf8.codes(s) do
277
+ // body
278
+ // end
279
+ // ```
280
+ // will iterate over all UTF-8 characters in string s, with p being the position (in bytes) and c the code point of each character. It raises an error if it meets any invalid byte sequence.
281
+ // signature (s [, lax]) -> fun(s: string, p: integer):integer, integer
282
+ int luajit_utf8_codes (lua_State * L )
283
+ {
284
+ int lax = lua_toboolean (L , 2 );
285
+ const char * s = luaL_checkstring (L , 1 );
286
+ lua_pushcfunction (L , lax ? iter_codes_lax : iter_codes_strict );
287
+ lua_pushvalue (L , 1 );
288
+ lua_pushinteger (L , 0 );
289
+ return 3 ;
290
+ }
0 commit comments