
Commit f947fba

Add support for new parse-latin, parse-english
This is a breaking change: these parsers were updated, which cleans up their API a lot and adds types.
1 parent 66a1962 commit f947fba
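
For orientation, this is roughly how the updated parsers are consumed (a minimal sketch, assuming this package is installed as `mdast-util-to-nlcst` alongside `parse-english@^6`, `mdast-util-from-markdown`, and `vfile`; the sample text and variable names are illustrative, not part of this commit):

import {fromMarkdown} from 'mdast-util-from-markdown'
import {ParseEnglish} from 'parse-english'
import {toNlcst} from 'mdast-util-to-nlcst'
import {VFile} from 'vfile'

// Any markdown works here; the value is only an example.
const file = new VFile('Some *foo*sball.')
const mdast = fromMarkdown(String(file))

// `ParseEnglish` v6 exposes `parse(value)`, `tokenize(value)`, and the plugin
// arrays that the updated `toNlcst` calls directly (see the diff below).
const nlcst = toNlcst(mdast, file, ParseEnglish)

console.log(nlcst)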

3 files changed: +153 -42 lines


lib/index.js

Lines changed: 150 additions & 36 deletions
@@ -1,25 +1,35 @@
 /**
  * @typedef {import('unist').Point} Point
+ * @typedef {import('unist').Position} UnistPosition
+ * @typedef {import('unist').Parent} UnistParent
  *
  * @typedef {import('nlcst').Root} NlcstRoot
  * @typedef {import('nlcst').Content} NlcstContent
  * @typedef {import('nlcst').SentenceContent} NlcstSentenceContent
  * @typedef {import('nlcst').WhiteSpace} NlcstWhiteSpace
- * @typedef {import('nlcst').Source} NlcstSource
- * @typedef {NlcstRoot|NlcstContent} NlcstNode
+ * @typedef {import('nlcst').Sentence} NlcstSentence
+ * @typedef {import('nlcst').Paragraph} NlcstParagraph
  *
  * @typedef {import('mdast').Root} MdastRoot
  * @typedef {import('mdast').Content} MdastContent
- * @typedef {MdastRoot|MdastContent} MdastNode
- * @typedef {Extract<MdastNode, import('unist').Parent>} MdastParent
  *
  * @typedef {import('vfile').VFile} VFile
+ *
  * @typedef {ReturnType<import('vfile-location').location>} Location
+ */
+
+/**
+ * @typedef {MdastRoot | MdastContent} MdastNode
+ * @typedef {NlcstRoot | NlcstContent} NlcstNode
+ * @typedef {Extract<NlcstNode, UnistParent>} NlcstParent
+ * @typedef {Extract<MdastNode, UnistParent>} MdastParent
+ *
  * @typedef {{
- *   parse(nodes: Array<NlcstContent>): NlcstRoot
- *   tokenizeSource(value: string): NlcstSource
- *   tokenizeWhiteSpace(value: string): NlcstWhiteSpace
- *   tokenize(value: string): Array<NlcstSentenceContent>
+ *   tokenizeSentencePlugins: Array<(node: NlcstSentence) => void>,
+ *   tokenizeParagraphPlugins: Array<(node: NlcstParagraph) => void>,
+ *   tokenizeRootPlugins: Array<(node: NlcstRoot) => void>,
+ *   parse(value: string | null | undefined): NlcstRoot
+ *   tokenize(value: string | null | undefined): Array<NlcstSentenceContent>
  * }} ParserInstance
  * @typedef {new () => ParserInstance} ParserConstructor
  *
@@ -44,6 +54,11 @@ import {location} from 'vfile-location'
 const defaultIgnore = ['table', 'tableRow', 'tableCell']
 const defaultSource = ['inlineCode']
 
+// Ported from:
+// <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/expressions.js#L5>
+const newLine = /^[ \t]*((\r?\n|\r)[\t ]*)+$/
+const terminalMarker = /^([!.?\u2026\u203D]+)$/
+
 /**
  * Transform a `tree` in mdast to nlcst.
  *
@@ -52,6 +67,7 @@ const defaultSource = ['inlineCode']
  * @param {ParserInstance|ParserConstructor} Parser
  * @param {Options} [options]
  */
+// eslint-disable-next-line complexity
 export function toNlcst(tree, file, Parser, options = {}) {
   // Crash on invalid parameters.
   if (!tree || !tree.type) {
@@ -78,31 +94,78 @@ export function toNlcst(tree, file, Parser, options = {}) {
 
   const parser = 'parse' in Parser ? Parser : new Parser()
 
-  const result = one(
-    {
-      doc: String(file),
-      place: location(file),
-      parser,
-      ignore: options.ignore
-        ? defaultIgnore.concat(options.ignore)
-        : defaultIgnore,
-      source: options.source
-        ? defaultSource.concat(options.source)
-        : defaultSource
-    },
-    tree
-  )
-
-  // Transform mdast into nlcst tokens, and pass these into `parser.parse` to
-  // insert sentences, paragraphs where needed.
-  return parser.parse(result || [])
+  /** @type {Context} */
+  const context = {
+    doc: String(file),
+    place: location(file),
+    parser,
+    ignore: options.ignore
+      ? defaultIgnore.concat(options.ignore)
+      : defaultIgnore,
+    source: options.source
+      ? defaultSource.concat(options.source)
+      : defaultSource
+  }
+
+  const result = one(context, tree)
+
+  if (result && result.length > 0) {
+    const start = pointStart(result[0])
+    const end = pointEnd(result[result.length - 1])
+
+    // Turn into a sentence.
+    /** @type {NlcstSentence} */
+    const sentence = {type: 'SentenceNode', children: result}
+
+    if (start && start.line && end && end.line) {
+      sentence.position = {start, end}
+    }
+
+    let index = -1
+    while (parser.tokenizeSentencePlugins[++index]) {
+      parser.tokenizeSentencePlugins[index](sentence)
+    }
+
+    // Turn into a paragraph.
+    /** @type {NlcstParagraph} */
+    const paragraph = {
+      type: 'ParagraphNode',
+      children: splitNode(sentence, 'PunctuationNode', terminalMarker)
+    }
+    if (start && start.line && end && end.line) {
+      paragraph.position = {start: {...start}, end: {...end}}
+    }
+
+    index = -1
+    while (parser.tokenizeParagraphPlugins[++index]) {
+      parser.tokenizeParagraphPlugins[index](paragraph)
+    }
+
+    /** @type {NlcstRoot} */
+    const root = {
+      type: 'RootNode',
+      children: splitNode(paragraph, 'WhiteSpaceNode', newLine)
+    }
+    if (start && start.line && end && end.line) {
+      root.position = {start: {...start}, end: {...end}}
+    }
+
+    index = -1
+    while (parser.tokenizeRootPlugins[++index]) {
+      parser.tokenizeRootPlugins[index](root)
+    }
+
+    return root
+  }
+
+  return {type: 'RootNode', children: []}
 }
 
 /**
  * Transform a single node.
  * @param {Context} config
  * @param {MdastNode} node
- * @returns {Array<NlcstContent>|undefined}
+ * @returns {Array<NlcstSentenceContent>|undefined}
  */
 function one(config, node) {
   const start = node.position ? node.position.start.offset : undefined
@@ -112,9 +175,10 @@ function one(config, node) {
     return patch(
       config,
       [
-        config.parser.tokenizeSource(
-          config.doc.slice(start, node.position.end.offset)
-        )
+        {
+          type: 'SourceNode',
+          value: config.doc.slice(start, node.position.end.offset)
+        }
      ],
       start
     )
@@ -133,7 +197,7 @@ function one(config, node) {
   }
 
   if (node.type === 'break') {
-    return patch(config, [config.parser.tokenizeWhiteSpace('\n')], start)
+    return patch(config, [{type: 'WhiteSpaceNode', value: '\n'}], start)
   }
 
   if (node.type === 'text') {
@@ -146,11 +210,11 @@ function one(config, node) {
  * Transform all nodes in `parent`.
  * @param {Context} config
  * @param {MdastParent} parent
- * @returns {Array<NlcstContent>}
+ * @returns {Array<NlcstSentenceContent>}
  */
 function all(config, parent) {
   let index = -1
-  /** @type {Array<NlcstContent>} */
+  /** @type {Array<NlcstSentenceContent>} */
   const results = []
   /** @type {Point|undefined} */
   let end
@@ -165,9 +229,11 @@ function all(config, parent) {
       start.line !== null &&
       start.line !== end.line
     ) {
-      const lineEnding = config.parser.tokenizeWhiteSpace(
-        '\n'.repeat(start.line - end.line)
-      )
+      /** @type {NlcstWhiteSpace} */
+      const lineEnding = {
+        type: 'WhiteSpaceNode',
+        value: '\n'.repeat(start.line - end.line)
+      }
       patch(config, [lineEnding], end.offset)
 
       if (lineEnding.value.length < 2) {
@@ -222,3 +288,51 @@ function patch(config, nodes, offset) {
 
   return nodes
 }
+
+// Ported from:
+// <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/index.js#L266-L310>
+/**
+ * A function that splits one node into several nodes.
+ *
+ * @template {NlcstParent} TheNode
+ * @param {TheNode} node
+ * @param {RegExp} expression
+ * @param {NlcstContent['type']} childType
+ * @returns {Array<TheNode>}
+ */
+function splitNode(node, childType, expression) {
+  /** @type {Array<TheNode>} */
+  const result = []
+  let index = -1
+  let start = 0
+
+  while (++index < node.children.length) {
+    const token = node.children[index]
+
+    if (
+      index === node.children.length - 1 ||
+      (token.type === childType && expression.test(toString(token)))
+    ) {
+      /** @type {TheNode} */
+      // @ts-expect-error: fine
+      const parent = {
+        type: node.type,
+        children: node.children.slice(start, index + 1)
+      }
+
+      const first = node.children[start]
+      const last = token
+      if (first.position && last.position) {
+        parent.position = {
+          start: first.position.start,
+          end: last.position.end
+        }
+      }
+
+      result.push(parent)
+      start = index + 1
+    }
+  }
+
+  return result
+}
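
Since toNlcst no longer calls the old tokenizeSource/tokenizeWhiteSpace helpers and instead drives the plugin arrays itself, a custom parser now has to match the new ParserInstance shape above. A rough stub of that shape (hypothetical, not part of this commit; real projects would use parse-latin, parse-english, or parse-dutch v6):

/**
 * Hypothetical stub matching the new `ParserInstance` typedef.
 * It does no real tokenization: every value becomes a single `TextNode`.
 */
const stubParser = {
  // Plugin arrays that the new `toNlcst` iterates over directly.
  tokenizeSentencePlugins: [],
  tokenizeParagraphPlugins: [],
  tokenizeRootPlugins: [],
  // Required by the typedef; after this change `toNlcst` builds the
  // `RootNode` itself and no longer calls `parse` (the stub ignores `value`).
  parse(value) {
    return {type: 'RootNode', children: []}
  },
  // `tokenize` now takes a string (or nullish) instead of prepared nodes.
  tokenize(value) {
    return value ? [{type: 'TextNode', value: String(value)}] : []
  }
}

Because 'parse' in stubParser is true, toNlcst(tree, file, stubParser) treats it as an instance rather than constructing it.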

package.json

Lines changed: 3 additions & 3 deletions
@@ -47,9 +47,9 @@
     "@types/tape": "^4.0.0",
     "c8": "^7.0.0",
     "is-hidden": "^2.0.0",
-    "parse-dutch": "^5.0.0",
-    "parse-english": "^5.0.0",
-    "parse-latin": "^5.0.0",
+    "parse-dutch": "^6.0.0",
+    "parse-english": "^6.0.0",
+    "parse-latin": "^6.0.0",
     "prettier": "^2.0.0",
     "remark": "^14.0.0",
     "remark-cli": "^11.0.0",

test/index.js

Lines changed: 0 additions & 3 deletions
@@ -14,11 +14,8 @@ import {remark} from 'remark'
 import remarkGfm from 'remark-gfm'
 import remarkFrontmatter from 'remark-frontmatter'
 import {toVFile as vfile} from 'to-vfile'
-// @ts-expect-error: to do type.
 import {ParseLatin} from 'parse-latin'
-// @ts-expect-error: to do type.
 import {ParseDutch} from 'parse-dutch'
-// @ts-expect-error: to do type.
 import {ParseEnglish} from 'parse-english'
 import {isHidden} from 'is-hidden'
 import {toNlcst} from '../index.js'
