1
1
/**
2
2
* @typedef {import('unist').Point } Point
3
+ * @typedef {import('unist').Position } UnistPosition
4
+ * @typedef {import('unist').Parent } UnistParent
3
5
*
4
6
* @typedef {import('nlcst').Root } NlcstRoot
5
7
* @typedef {import('nlcst').Content } NlcstContent
6
8
* @typedef {import('nlcst').SentenceContent } NlcstSentenceContent
7
9
* @typedef {import('nlcst').WhiteSpace } NlcstWhiteSpace
8
- * @typedef {import('nlcst').Source } NlcstSource
9
- * @typedef {NlcstRoot|NlcstContent } NlcstNode
10
+ * @typedef {import('nlcst').Sentence } NlcstSentence
11
+ * @typedef {import('nlcst').Paragraph } NlcstParagraph
10
12
*
11
13
* @typedef {import('mdast').Root } MdastRoot
12
14
* @typedef {import('mdast').Content } MdastContent
13
- * @typedef {MdastRoot|MdastContent } MdastNode
14
- * @typedef {Extract<MdastNode, import('unist').Parent> } MdastParent
15
15
*
16
16
* @typedef {import('vfile').VFile } VFile
17
+ *
17
18
* @typedef {ReturnType<import('vfile-location').location> } Location
19
+ */
20
+
21
+ /**
22
+ * @typedef {MdastRoot | MdastContent } MdastNode
23
+ * @typedef {NlcstRoot | NlcstContent } NlcstNode
24
+ * @typedef {Extract<NlcstNode, UnistParent> } NlcstParent
25
+ * @typedef {Extract<MdastNode, UnistParent> } MdastParent
26
+ *
18
27
* @typedef {{
19
- * parse(nodes: Array<NlcstContent>): NlcstRoot
20
- * tokenizeSource(value: string): NlcstSource
21
- * tokenizeWhiteSpace(value: string): NlcstWhiteSpace
22
- * tokenize(value: string): Array<NlcstSentenceContent>
28
+ * tokenizeSentencePlugins: Array<(node: NlcstSentence) => void>,
29
+ * tokenizeParagraphPlugins: Array<(node: NlcstParagraph) => void>,
30
+ * tokenizeRootPlugins: Array<(node: NlcstRoot) => void>,
31
+ * parse(value: string | null | undefined): NlcstRoot
32
+ * tokenize(value: string | null | undefined): Array<NlcstSentenceContent>
23
33
* }} ParserInstance
24
34
* @typedef {new () => ParserInstance } ParserConstructor
25
35
*
@@ -44,6 +54,11 @@ import {location} from 'vfile-location'
44
54
// Base list for the context's `ignore` field; `toNlcst` concatenates
// `options.ignore` onto it when that option is given.
const defaultIgnore = [ 'table' , 'tableRow' , 'tableCell' ]
45
55
// Base list for the context's `source` field; `toNlcst` concatenates
// `options.source` onto it when that option is given.
const defaultSource = [ 'inlineCode' ]
46
56
57
+ // Ported from:
58
+ // <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/expressions.js#L5>
59
// Passed by `toNlcst` to `splitNode` together with 'WhiteSpaceNode' to split
// the paragraph into the children of the root.
+ const newLine = / ^ [ \t ] * ( ( \r ? \n | \r ) [ \t ] * ) + $ /
60
// Passed by `toNlcst` to `splitNode` together with 'PunctuationNode' to split
// the sentence into the children of the paragraph.
+ const terminalMarker = / ^ ( [ ! . ? \u2026 \u203D ] + ) $ /
61
+
47
62
/**
48
63
* Transform a `tree` in mdast to nlcst.
49
64
*
@@ -52,6 +67,7 @@ const defaultSource = ['inlineCode']
52
67
* @param {ParserInstance|ParserConstructor } Parser
53
68
* @param {Options } [options]
54
69
*/
70
+ // eslint-disable-next-line complexity
55
71
export function toNlcst ( tree , file , Parser , options = { } ) {
56
72
// Crash on invalid parameters.
57
73
if ( ! tree || ! tree . type ) {
@@ -78,31 +94,78 @@ export function toNlcst(tree, file, Parser, options = {}) {
78
94
79
95
const parser = 'parse' in Parser ? Parser : new Parser ( )
80
96
81
- const result = one (
82
- {
83
- doc : String ( file ) ,
84
- place : location ( file ) ,
85
- parser,
86
- ignore : options . ignore
87
- ? defaultIgnore . concat ( options . ignore )
88
- : defaultIgnore ,
89
- source : options . source
90
- ? defaultSource . concat ( options . source )
91
- : defaultSource
92
- } ,
93
- tree
94
- )
95
-
96
- // Transform mdast into nlcst tokens, and pass these into `parser.parse` to
97
- // insert sentences, paragraphs where needed.
98
- return parser . parse ( result || [ ] )
97
+ /** @type {Context } */
98
+ const context = {
99
+ doc : String ( file ) ,
100
+ place : location ( file ) ,
101
+ parser,
102
+ ignore : options . ignore
103
+ ? defaultIgnore . concat ( options . ignore )
104
+ : defaultIgnore ,
105
+ source : options . source
106
+ ? defaultSource . concat ( options . source )
107
+ : defaultSource
108
+ }
109
+
110
+ const result = one ( context , tree )
111
+
112
+ if ( result && result . length > 0 ) {
113
+ const start = pointStart ( result [ 0 ] )
114
+ const end = pointEnd ( result [ result . length - 1 ] )
115
+
116
+ // Turn into a sentence.
117
+ /** @type {NlcstSentence } */
118
+ const sentence = { type : 'SentenceNode' , children : result }
119
+
120
+ if ( start && start . line && end && end . line ) {
121
+ sentence . position = { start, end}
122
+ }
123
+
124
+ let index = - 1
125
+ while ( parser . tokenizeSentencePlugins [ ++ index ] ) {
126
+ parser . tokenizeSentencePlugins [ index ] ( sentence )
127
+ }
128
+
129
+ // Turn into a paragraph.
130
+ /** @type {NlcstParagraph } */
131
+ const paragraph = {
132
+ type : 'ParagraphNode' ,
133
+ children : splitNode ( sentence , 'PunctuationNode' , terminalMarker )
134
+ }
135
+ if ( start && start . line && end && end . line ) {
136
+ paragraph . position = { start : { ...start } , end : { ...end } }
137
+ }
138
+
139
+ index = - 1
140
+ while ( parser . tokenizeParagraphPlugins [ ++ index ] ) {
141
+ parser . tokenizeParagraphPlugins [ index ] ( paragraph )
142
+ }
143
+
144
+ /** @type {NlcstRoot } */
145
+ const root = {
146
+ type : 'RootNode' ,
147
+ children : splitNode ( paragraph , 'WhiteSpaceNode' , newLine )
148
+ }
149
+ if ( start && start . line && end && end . line ) {
150
+ root . position = { start : { ...start } , end : { ...end } }
151
+ }
152
+
153
+ index = - 1
154
+ while ( parser . tokenizeRootPlugins [ ++ index ] ) {
155
+ parser . tokenizeRootPlugins [ index ] ( root )
156
+ }
157
+
158
+ return root
159
+ }
160
+
161
+ return { type : 'RootNode' , children : [ ] }
99
162
}
100
163
101
164
/**
102
165
* Transform a single node.
103
166
* @param {Context } config
104
167
* @param {MdastNode } node
105
- * @returns {Array<NlcstContent >|undefined }
168
+ * @returns {Array<NlcstSentenceContent >|undefined }
106
169
*/
107
170
function one ( config , node ) {
108
171
const start = node . position ? node . position . start . offset : undefined
@@ -112,9 +175,10 @@ function one(config, node) {
112
175
return patch (
113
176
config ,
114
177
[
115
- config . parser . tokenizeSource (
116
- config . doc . slice ( start , node . position . end . offset )
117
- )
178
+ {
179
+ type : 'SourceNode' ,
180
+ value : config . doc . slice ( start , node . position . end . offset )
181
+ }
118
182
] ,
119
183
start
120
184
)
@@ -133,7 +197,7 @@ function one(config, node) {
133
197
}
134
198
135
199
if ( node . type === 'break' ) {
136
- return patch ( config , [ config . parser . tokenizeWhiteSpace ( ' \n') ] , start )
200
+ return patch ( config , [ { type : 'WhiteSpaceNode' , value : ' \n'} ] , start )
137
201
}
138
202
139
203
if ( node . type === 'text' ) {
@@ -146,11 +210,11 @@ function one(config, node) {
146
210
* Transform all nodes in `parent`.
147
211
* @param {Context } config
148
212
* @param {MdastParent } parent
149
- * @returns {Array<NlcstContent > }
213
+ * @returns {Array<NlcstSentenceContent > }
150
214
*/
151
215
function all ( config , parent ) {
152
216
let index = - 1
153
- /** @type {Array<NlcstContent > } */
217
+ /** @type {Array<NlcstSentenceContent > } */
154
218
const results = [ ]
155
219
/** @type {Point|undefined } */
156
220
let end
@@ -165,9 +229,11 @@ function all(config, parent) {
165
229
start . line !== null &&
166
230
start . line !== end . line
167
231
) {
168
- const lineEnding = config . parser . tokenizeWhiteSpace (
169
- '\n' . repeat ( start . line - end . line )
170
- )
232
+ /** @type {NlcstWhiteSpace } */
233
+ const lineEnding = {
234
+ type : 'WhiteSpaceNode' ,
235
+ value : '\n' . repeat ( start . line - end . line )
236
+ }
171
237
patch ( config , [ lineEnding ] , end . offset )
172
238
173
239
if ( lineEnding . value . length < 2 ) {
@@ -222,3 +288,51 @@ function patch(config, nodes, offset) {
222
288
223
289
return nodes
224
290
}
291
+
292
+ // Ported from:
293
+ // <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/index.js#L266-L310>
294
/**
 * Split one parent node into several nodes of the same type.
 *
 * Walks `node.children` in order and closes a chunk after every child whose
 * `type` equals `childType` and whose `toString(child)` text matches
 * `expression`.
 * The last child always closes the final chunk, so every child ends up in
 * exactly one of the returned nodes.
 *
 * @template {NlcstParent} TheNode
 * @param {TheNode} node
 *   Parent whose children are distributed over the results; `node` itself and
 *   its `children` array are left untouched.
 * @param {NlcstContent['type']} childType
 *   Type a child must have to be considered a boundary.
 * @param {RegExp} expression
 *   Expression tested against the text of a boundary candidate.
 * @returns {Array<TheNode>}
 *   New nodes carrying `node.type`; empty when `node` has no children.
 */
function splitNode(node, childType, expression) {
  /** @type {Array<TheNode>} */
  const result = []
  let index = -1
  let start = 0

  while (++index < node.children.length) {
    const token = node.children[index]

    // Close the current chunk on a matching boundary child, or on the very
    // last child so trailing content is never dropped.
    if (
      index === node.children.length - 1 ||
      (token.type === childType && expression.test(toString(token)))
    ) {
      /** @type {TheNode} */
      // @ts-expect-error: fine
      const parent = {
        type: node.type,
        children: node.children.slice(start, index + 1)
      }

      const first = node.children[start]
      const last = token

      // Only positioned when both outer children carry positional info.
      if (first.position && last.position) {
        // Copy the points instead of sharing them with the children, so that
        // changing the position of the new node cannot alter the position of
        // `first` or `last` (same approach as the `{...start}` / `{...end}`
        // copies in `toNlcst`).
        parent.position = {
          start: {...first.position.start},
          end: {...last.position.end}
        }
      }

      result.push(parent)
      start = index + 1
    }
  }

  return result
}
0 commit comments