
Commit f947fba

Add support for new parse-latin, parse-english
This is a breaking change: these parsers were updated, which cleans up their API a lot and adds types.
1 parent 66a1962 commit f947fba
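
For orientation, this is roughly how the updated parsers are consumed (a minimal sketch, assuming this package is installed as `mdast-util-to-nlcst` alongside `parse-english@^6`, `mdast-util-from-markdown`, and `vfile`; the sample text and variable names are illustrative, not part of this commit):

import {fromMarkdown} from 'mdast-util-from-markdown'
import {ParseEnglish} from 'parse-english'
import {toNlcst} from 'mdast-util-to-nlcst'
import {VFile} from 'vfile'

// Any markdown works here; the value is only an example.
const file = new VFile('Some *foo*sball.')
const mdast = fromMarkdown(String(file))

// `ParseEnglish` v6 exposes `parse(value)`, `tokenize(value)`, and the plugin
// arrays that the updated `toNlcst` calls directly (see the diff below).
const nlcst = toNlcst(mdast, file, ParseEnglish)

console.log(nlcst)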

3 files changed: +153 -42 lines


lib/index.js

Lines changed: 150 additions & 36 deletions
@@ -1,25 +1,35 @@
 /**
  * @typedef {import('unist').Point} Point
+ * @typedef {import('unist').Position} UnistPosition
+ * @typedef {import('unist').Parent} UnistParent
  *
  * @typedef {import('nlcst').Root} NlcstRoot
  * @typedef {import('nlcst').Content} NlcstContent
  * @typedef {import('nlcst').SentenceContent} NlcstSentenceContent
  * @typedef {import('nlcst').WhiteSpace} NlcstWhiteSpace
- * @typedef {import('nlcst').Source} NlcstSource
- * @typedef {NlcstRoot|NlcstContent} NlcstNode
+ * @typedef {import('nlcst').Sentence} NlcstSentence
+ * @typedef {import('nlcst').Paragraph} NlcstParagraph
  *
  * @typedef {import('mdast').Root} MdastRoot
  * @typedef {import('mdast').Content} MdastContent
- * @typedef {MdastRoot|MdastContent} MdastNode
- * @typedef {Extract<MdastNode, import('unist').Parent>} MdastParent
  *
  * @typedef {import('vfile').VFile} VFile
+ *
  * @typedef {ReturnType<import('vfile-location').location>} Location
+ */
+
+/**
+ * @typedef {MdastRoot | MdastContent} MdastNode
+ * @typedef {NlcstRoot | NlcstContent} NlcstNode
+ * @typedef {Extract<NlcstNode, UnistParent>} NlcstParent
+ * @typedef {Extract<MdastNode, UnistParent>} MdastParent
+ *
  * @typedef {{
- *   parse(nodes: Array<NlcstContent>): NlcstRoot
- *   tokenizeSource(value: string): NlcstSource
- *   tokenizeWhiteSpace(value: string): NlcstWhiteSpace
- *   tokenize(value: string): Array<NlcstSentenceContent>
+ *   tokenizeSentencePlugins: Array<(node: NlcstSentence) => void>,
+ *   tokenizeParagraphPlugins: Array<(node: NlcstParagraph) => void>,
+ *   tokenizeRootPlugins: Array<(node: NlcstRoot) => void>,
+ *   parse(value: string | null | undefined): NlcstRoot
+ *   tokenize(value: string | null | undefined): Array<NlcstSentenceContent>
  * }} ParserInstance
  * @typedef {new () => ParserInstance} ParserConstructor
  *
@@ -44,6 +54,11 @@ import {location} from 'vfile-location'
 const defaultIgnore = ['table', 'tableRow', 'tableCell']
 const defaultSource = ['inlineCode']
 
+// Ported from:
+// <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/expressions.js#L5>
+const newLine = /^[ \t]*((\r?\n|\r)[\t ]*)+$/
+const terminalMarker = /^([!.?\u2026\u203D]+)$/
+
 /**
  * Transform a `tree` in mdast to nlcst.
  *
@@ -52,6 +67,7 @@ const defaultSource = ['inlineCode']
  * @param {ParserInstance|ParserConstructor} Parser
  * @param {Options} [options]
  */
+// eslint-disable-next-line complexity
 export function toNlcst(tree, file, Parser, options = {}) {
   // Crash on invalid parameters.
   if (!tree || !tree.type) {
@@ -78,31 +94,78 @@ export function toNlcst(tree, file, Parser, options = {}) {
 
   const parser = 'parse' in Parser ? Parser : new Parser()
 
-  const result = one(
-    {
-      doc: String(file),
-      place: location(file),
-      parser,
-      ignore: options.ignore
-        ? defaultIgnore.concat(options.ignore)
-        : defaultIgnore,
-      source: options.source
-        ? defaultSource.concat(options.source)
-        : defaultSource
-    },
-    tree
-  )
-
-  // Transform mdast into nlcst tokens, and pass these into `parser.parse` to
-  // insert sentences, paragraphs where needed.
-  return parser.parse(result || [])
+  /** @type {Context} */
+  const context = {
+    doc: String(file),
+    place: location(file),
+    parser,
+    ignore: options.ignore
+      ? defaultIgnore.concat(options.ignore)
+      : defaultIgnore,
+    source: options.source
+      ? defaultSource.concat(options.source)
+      : defaultSource
+  }
+
+  const result = one(context, tree)
+
+  if (result && result.length > 0) {
+    const start = pointStart(result[0])
+    const end = pointEnd(result[result.length - 1])
+
+    // Turn into a sentence.
+    /** @type {NlcstSentence} */
+    const sentence = {type: 'SentenceNode', children: result}
+
+    if (start && start.line && end && end.line) {
+      sentence.position = {start, end}
+    }
+
+    let index = -1
+    while (parser.tokenizeSentencePlugins[++index]) {
+      parser.tokenizeSentencePlugins[index](sentence)
+    }
+
+    // Turn into a paragraph.
+    /** @type {NlcstParagraph} */
+    const paragraph = {
+      type: 'ParagraphNode',
+      children: splitNode(sentence, 'PunctuationNode', terminalMarker)
+    }
+    if (start && start.line && end && end.line) {
+      paragraph.position = {start: {...start}, end: {...end}}
+    }
+
+    index = -1
+    while (parser.tokenizeParagraphPlugins[++index]) {
+      parser.tokenizeParagraphPlugins[index](paragraph)
+    }
+
+    /** @type {NlcstRoot} */
+    const root = {
+      type: 'RootNode',
+      children: splitNode(paragraph, 'WhiteSpaceNode', newLine)
+    }
+    if (start && start.line && end && end.line) {
+      root.position = {start: {...start}, end: {...end}}
+    }
+
+    index = -1
+    while (parser.tokenizeRootPlugins[++index]) {
+      parser.tokenizeRootPlugins[index](root)
+    }
+
+    return root
+  }
+
+  return {type: 'RootNode', children: []}
 }
 
 /**
  * Transform a single node.
  * @param {Context} config
  * @param {MdastNode} node
- * @returns {Array<NlcstContent>|undefined}
+ * @returns {Array<NlcstSentenceContent>|undefined}
  */
 function one(config, node) {
   const start = node.position ? node.position.start.offset : undefined
@@ -112,9 +175,10 @@ function one(config, node) {
     return patch(
       config,
       [
-        config.parser.tokenizeSource(
-          config.doc.slice(start, node.position.end.offset)
-        )
+        {
+          type: 'SourceNode',
+          value: config.doc.slice(start, node.position.end.offset)
+        }
      ],
       start
     )
@@ -133,7 +197,7 @@ function one(config, node) {
   }
 
   if (node.type === 'break') {
-    return patch(config, [config.parser.tokenizeWhiteSpace('\n')], start)
+    return patch(config, [{type: 'WhiteSpaceNode', value: '\n'}], start)
   }
 
   if (node.type === 'text') {
@@ -146,11 +210,11 @@ function one(config, node) {
  * Transform all nodes in `parent`.
  * @param {Context} config
  * @param {MdastParent} parent
- * @returns {Array<NlcstContent>}
+ * @returns {Array<NlcstSentenceContent>}
  */
 function all(config, parent) {
   let index = -1
-  /** @type {Array<NlcstContent>} */
+  /** @type {Array<NlcstSentenceContent>} */
   const results = []
   /** @type {Point|undefined} */
   let end
@@ -165,9 +229,11 @@ function all(config, parent) {
       start.line !== null &&
       start.line !== end.line
     ) {
-      const lineEnding = config.parser.tokenizeWhiteSpace(
-        '\n'.repeat(start.line - end.line)
-      )
+      /** @type {NlcstWhiteSpace} */
+      const lineEnding = {
+        type: 'WhiteSpaceNode',
+        value: '\n'.repeat(start.line - end.line)
+      }
       patch(config, [lineEnding], end.offset)
 
       if (lineEnding.value.length < 2) {
@@ -222,3 +288,51 @@ function patch(config, nodes, offset) {
 
   return nodes
 }
+
+// Ported from:
+// <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/index.js#L266-L310>
+/**
+ * A function that splits one node into several nodes.
+ *
+ * @template {NlcstParent} TheNode
+ * @param {TheNode} node
+ * @param {RegExp} expression
+ * @param {NlcstContent['type']} childType
+ * @returns {Array<TheNode>}
+ */
+function splitNode(node, childType, expression) {
+  /** @type {Array<TheNode>} */
+  const result = []
+  let index = -1
+  let start = 0
+
+  while (++index < node.children.length) {
+    const token = node.children[index]
+
+    if (
+      index === node.children.length - 1 ||
+      (token.type === childType && expression.test(toString(token)))
+    ) {
+      /** @type {TheNode} */
+      // @ts-expect-error: fine
+      const parent = {
+        type: node.type,
+        children: node.children.slice(start, index + 1)
+      }
+
+      const first = node.children[start]
+      const last = token
+      if (first.position && last.position) {
+        parent.position = {
+          start: first.position.start,
+          end: last.position.end
+        }
+      }
+
+      result.push(parent)
+      start = index + 1
+    }
+  }
+
+  return result
+}
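
Since toNlcst no longer calls the old tokenizeSource/tokenizeWhiteSpace helpers and instead drives the plugin arrays itself, a custom parser now has to match the new ParserInstance shape above. A rough stub of that shape (hypothetical, not part of this commit; real projects would use parse-latin, parse-english, or parse-dutch v6):

/**
 * Hypothetical stub matching the new `ParserInstance` typedef.
 * It does no real tokenization: every value becomes a single `TextNode`.
 */
const stubParser = {
  // Plugin arrays that the new `toNlcst` iterates over directly.
  tokenizeSentencePlugins: [],
  tokenizeParagraphPlugins: [],
  tokenizeRootPlugins: [],
  // Required by the typedef; after this change `toNlcst` builds the
  // `RootNode` itself and no longer calls `parse` (the stub ignores `value`).
  parse(value) {
    return {type: 'RootNode', children: []}
  },
  // `tokenize` now takes a string (or nullish) instead of prepared nodes.
  tokenize(value) {
    return value ? [{type: 'TextNode', value: String(value)}] : []
  }
}

Because 'parse' in stubParser is true, toNlcst(tree, file, stubParser) treats it as an instance rather than constructing it.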

package.json

Lines changed: 3 additions & 3 deletions
@@ -47,9 +47,9 @@
     "@types/tape": "^4.0.0",
     "c8": "^7.0.0",
     "is-hidden": "^2.0.0",
-    "parse-dutch": "^5.0.0",
-    "parse-english": "^5.0.0",
-    "parse-latin": "^5.0.0",
+    "parse-dutch": "^6.0.0",
+    "parse-english": "^6.0.0",
+    "parse-latin": "^6.0.0",
     "prettier": "^2.0.0",
     "remark": "^14.0.0",
     "remark-cli": "^11.0.0",

test/index.js

Lines changed: 0 additions & 3 deletions
@@ -14,11 +14,8 @@ import {remark} from 'remark'
 import remarkGfm from 'remark-gfm'
 import remarkFrontmatter from 'remark-frontmatter'
 import {toVFile as vfile} from 'to-vfile'
-// @ts-expect-error: to do type.
 import {ParseLatin} from 'parse-latin'
-// @ts-expect-error: to do type.
 import {ParseDutch} from 'parse-dutch'
-// @ts-expect-error: to do type.
 import {ParseEnglish} from 'parse-english'
 import {isHidden} from 'is-hidden'
 import {toNlcst} from '../index.js'
