|
1 |
| -import sax from 'sax' |
2 |
| -import Message from 'vfile-message' |
3 |
| - |
4 |
// Parser constructor exposed by `sax`.
const Parser = sax.SAXParser

// Short alias used by the character-class helpers in this module.
const fromCharCode = String.fromCharCode

// One line ending (CRLF, LF, or CR).
// Global, so callers can set `lastIndex` and scan onward with `exec`.
const search = /\r?\n|\r/g
9 |
| - |
10 |
/**
 * Parse XML into a tree of plain node objects.
 *
 * The whole document is written to a `sax` parser in one go; the parser's
 * events are turned into nodes (`root`, `doctype`, `instruction`, `comment`,
 * `cdata`, `text`, `element`), each carrying a `position` with `start` and
 * `end` points (`line`, `column`, `offset`).
 *
 * @param doc
 *   XML source, handed as is to `parser.write`.
 * @returns
 *   The root node (`{type: 'root', children: [...]}`).
 * @throws
 *   A `Message` (from `vfile-message`) whose origin is
 *   `xast-util-from-xml:<id>`, for parser errors, SGML declarations, and
 *   doctypes this function does not accept (such as internal subsets).
 */
export function fromXml(doc) {
  const parser = new Parser(true, {position: true, strictEntities: true})
  // Open nodes, innermost last; the root is never popped.
  const stack = [{type: 'root', children: []}]
  // Where the previous token ended, which is where the next node starts.
  let position = now()

  parser.ondoctype = ondoctype
  parser.onsgmldeclaration = onsgmldeclaration
  parser.onprocessinginstruction = onprocessinginstruction
  parser.ontext = ontext
  parser.oncomment = oncomment
  parser.onopencdata = oncdataopen
  parser.oncdata = oncdatavalue
  parser.onclosecdata = exit
  parser.onopentag = onopen
  parser.onclosetag = exit
  parser.onerror = onerror

  parser.write(doc).close()

  return stack[0]

  // Rethrow a parser error as a message, without the trailing position info
  // (everything from `\nLine` on) that the parser appends.
  function onerror(error) {
    const index = error.message.indexOf('\nLine')
    // The substring should always be included, but this guards against
    // changes in newer sax versions.
    /* c8 ignore next */
    fail(index === -1 ? error.message : error.message.slice(0, index), 'sax')
  }

  // SGML declarations are always rejected.
  function onsgmldeclaration() {
    fail('Unexpected SGML declaration', 'unexpected-sgml')
  }

  // Parse the raw contents of a doctype (what follows `<!DOCTYPE`) with a
  // small state machine, filling in `name`, `public`, and `system`.
  // Internal subsets (`[`) and malformed identifiers fail.
  // eslint-disable-next-line complexity
  function ondoctype(value) {
    const node = {type: 'doctype', name: '', public: null, system: null}
    let index = -1
    let state = 'BEGIN'
    // State to continue in once the keyword in `buffer` is fully matched.
    let returnState
    // Keyword being matched (`PUBLIC` or `SYSTEM`) and the index of the last
    // matched character in it.
    let buffer
    let bufferIndex
    // Start index of the name or literal currently being read.
    let start
    // Quote character code that opened the current literal.
    let marker

    // One extra iteration (`index === value.length`) represents EOF.
    while (++index <= value.length) {
      const code =
        index === value.length ? null /* EOF */ : value.charCodeAt(index)

      switch (state) {
        case 'BEGIN':
          if (isSpace(code)) {
            state = 'BEFORE_NAME'
          } else {
            fail('Expected doctype name', 'doctype-name')
          }

          break
        case 'BEFORE_NAME':
          if (isSpace(code)) {
            // As expected.
          } else if (isNameStartChar(code)) {
            state = 'IN_NAME'
            start = index
          } else {
            fail('Expected start of doctype name', 'doctype-name')
          }

          break
        case 'IN_NAME':
          if (isNameChar(code)) {
            // As expected.
          } else if (isSpace(code) || code === null /* EOF */) {
            state = 'AFTER_NAME'
            node.name = value.slice(start, index)
          } else if (code === 91 /* `[` */) {
            fail('Unexpected internal subset', 'doctype-internal-subset')
          } else {
            fail(
              'Expected doctype name character, whitespace, or doctype end',
              'doctype-name'
            )
          }

          break
        case 'AFTER_NAME':
          if (code === null /* EOF */) {
            // Done.
          } else if (isSpace(code)) {
            // As expected.
          } else if (code === 80 /* `P` */) {
            state = 'IN_EID'
            returnState = 'AFTER_PUBLIC'
            buffer = 'PUBLIC'
            bufferIndex = 0
          } else if (code === 83 /* `S` */) {
            state = 'IN_EID'
            returnState = 'AFTER_SYSTEM'
            buffer = 'SYSTEM'
            bufferIndex = 0
          } else if (code === 91 /* `[` */) {
            fail('Unexpected internal subset', 'doctype-internal-subset')
          } else {
            fail(
              'Expected external identifier (`PUBLIC` or `SYSTEM`), whitespace, or doctype end',
              'doctype-external-identifier'
            )
          }

          break
        case 'IN_EID':
          // The first character was matched in `AFTER_NAME`; check the rest.
          if (code === buffer.charCodeAt(++bufferIndex)) {
            if (bufferIndex === buffer.length - 1) {
              state = returnState
            }
          } else {
            fail(
              'Expected external identifier (`PUBLIC` or `SYSTEM`)',
              'doctype-external-identifier'
            )
          }

          break
        case 'AFTER_PUBLIC':
          if (isSpace(code)) {
            state = 'BEFORE_PUBLIC_LITERAL'
          } else {
            fail('Expected whitespace after `PUBLIC`', 'doctype-public-literal')
          }

          break
        case 'AFTER_SYSTEM':
          if (isSpace(code)) {
            state = 'BEFORE_SYSTEM_LITERAL'
          } else {
            fail('Expected whitespace after `SYSTEM`', 'doctype-system-literal')
          }

          break
        case 'BEFORE_PUBLIC_LITERAL':
          if (isSpace(code)) {
            // As expected.
          } else if (code === 34 /* `"` */ || code === 39 /* `'` */) {
            state = 'IN_PUBLIC_LITERAL'
            start = index + 1
            marker = code
          } else {
            fail(
              'Expected quote or apostrophe to start public literal',
              'doctype-public-literal'
            )
          }

          break
        case 'IN_PUBLIC_LITERAL':
          // The closing quote is checked first: an apostrophe is also a valid
          // pubid character, so it only ends an apostrophe-quoted literal.
          if (code === marker) {
            state = 'AFTER_PUBLIC_LITERAL'
            node.public = value.slice(start, index)
          } else if (isPubidChar(code)) {
            // As expected.
          } else {
            fail(
              'Expected pubid character in public literal',
              'doctype-public-literal'
            )
          }

          break
        case 'AFTER_PUBLIC_LITERAL':
          if (isSpace(code)) {
            // As expected.
            state = 'BEFORE_SYSTEM_LITERAL'
          } else {
            fail(
              'Expected whitespace after public literal',
              'doctype-system-literal'
            )
          }

          break
        case 'BEFORE_SYSTEM_LITERAL':
          if (isSpace(code)) {
            // As expected.
          } else if (code === 34 /* `"` */ || code === 39 /* `'` */) {
            state = 'IN_SYSTEM_LITERAL'
            start = index + 1
            marker = code
          } else {
            fail(
              'Expected quote or apostrophe to start system literal',
              'doctype-system-literal'
            )
          }

          break
        case 'IN_SYSTEM_LITERAL':
          // Handled by SAX, but keep it to guard against changes in newer sax
          // versions.
          /* c8 ignore next 5 */
          if (code === null /* EOF */) {
            fail(
              'Expected quote or apostrophe to end system literal',
              'doctype-system-literal'
            )
          } else if (code === marker) {
            state = 'AFTER_SYSTEM_LITERAL'
            node.system = value.slice(start, index)
          } else {
            // As expected.
          }

          break

        case 'AFTER_SYSTEM_LITERAL':
          // Rule ids here carry the same `doctype-` prefix as every other
          // doctype failure in this function.
          if (code === null /* EOF */) {
            // Done.
          } else if (isSpace(code)) {
            // As expected.
          } else if (code === 91 /* `[` */) {
            fail('Unexpected internal subset', 'doctype-internal-subset')
          } else {
            fail(
              'Expected whitespace or end of doctype',
              'doctype-system-literal'
            )
          }

          break
        // Guard against new states.
        /* c8 ignore next 2 */
        default:
          throw new Error('Unhandled state `' + state + '`')
      }
    }

    enter(node)
    exit()
  }

  // Add an `instruction` node; name and body are coerced to strings.
  function onprocessinginstruction(value) {
    enter({
      type: 'instruction',
      name: String(value.name),
      value: String(value.body)
    })
    exit()
  }

  // Add a `comment` node.
  function oncomment(value) {
    const node = {type: 'comment', value}

    // The parser reports comments as ending right before their last
    // character (`>`), so move the end one character forward.
    const actualEnd = now()
    actualEnd.column++
    actualEnd.offset++

    enter(node)
    exit()

    // Overwrite the end set by `exit` and continue from the corrected point.
    node.position.end = Object.assign({}, actualEnd)
    position = actualEnd
  }

  // Open a `cdata` node; its value is collected in `oncdatavalue` and the
  // node is closed by `exit` (wired to `onclosecdata`).
  function oncdataopen() {
    enter({type: 'cdata', value: ''})
  }

  // Append a chunk to the currently open `cdata` node.
  function oncdatavalue(value) {
    stack[stack.length - 1].value += value
  }

  // Add a `text` node.
  function ontext(value) {
    const node = {type: 'text', value}
    // When text is emitted the parser is already at the next token, so its
    // current position cannot be used as the end.
    // Instead, walk `value` from the last known position.
    const actualEnd = Object.assign({}, position)
    let start = 0

    while (start < value.length) {
      search.lastIndex = start
      const match = search.exec(value)

      if (match) {
        // A line ending: move to the start of the next line.
        actualEnd.line++
        actualEnd.column = 1
        start = match.index + match[0].length
      } else {
        // No more line endings: the rest is on the current line.
        actualEnd.column += value.length - start
        start = value.length
      }
    }

    actualEnd.offset += value.length

    enter(node)
    exit()

    // Overwrite the end set by `exit` and continue from the corrected point.
    node.position.end = Object.assign({}, actualEnd)
    position = actualEnd
  }

  // Open an `element` node; it is closed by `exit` (wired to `onclosetag`).
  function onopen(value) {
    enter({
      type: 'element',
      name: value.name,
      attributes: value.attributes,
      children: []
    })
  }

  // Start `node` at the last known position, add it to the current parent,
  // and make it the current node.
  function enter(node) {
    node.position = {start: Object.assign({}, position)}
    stack[stack.length - 1].children.push(node)
    stack.push(node)
    position = now()
  }

  // Close the current node, ending it at the parser's current position.
  function exit() {
    position = now()
    stack.pop().position.end = Object.assign({}, position)
  }

  // The parser's current point; line and column are shifted up by one
  // relative to what the parser reports.
  function now() {
    return {
      line: parser.line + 1,
      column: parser.column + 1,
      offset: parser.position
    }
  }

  // Throw a message at the parser's current point.
  function fail(reason, id) {
    throw new Message(reason, now(), 'xast-util-from-xml:' + id)
  }
}
358 |
| - |
359 |
// Check whether the character for `code` may start an XML name.
// `code` goes through `fromCharCode`, so it is treated as a single UTF-16
// code unit (`null` becomes U+0000, which does not match).
// See: <https://www.w3.org/TR/xml/#NT-NameStartChar>
function isNameStartChar(code) {
  const nameStart =
    /[:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/
  const character = fromCharCode(code)

  return nameStart.test(character)
}
365 |
| - |
366 |
// Check whether the character for `code` may continue an XML name: either a
// name start character, or one of the extra characters allowed after it.
// See: <https://www.w3.org/TR/xml/#NT-NameChar>
function isNameChar(code) {
  if (isNameStartChar(code)) {
    return true
  }

  return /[-.\d\u00B7\u0300-\u036F\u203F\u2040]/.test(fromCharCode(code))
}
373 |
| - |
374 |
// Check whether the character for `code` is a tab, line feed, carriage
// return, or space.
function isSpace(code) {
  const character = fromCharCode(code)

  return /[\t\n\r ]/.test(character)
}
377 |
| - |
378 |
// Check whether the character for `code` is accepted inside a public
// identifier literal (quotation marks are not; apostrophes are).
function isPubidChar(code) {
  const character = fromCharCode(code)

  return /[\n\r !#$%'-;=?-Z_a-z]/.test(character)
}
| 1 | +export {fromXml} from './lib/index.js' |
0 commit comments