parse-latin - npm Package Compare versions

index.d.ts

lib/expressions.d.ts

lib/index.d.ts

lib/plugin/break-implicit-sentences.d.ts

lib/plugin/make-final-white-space-siblings.d.ts

lib/plugin/make-initial-white-space-siblings.d.ts

lib/plugin/merge-affix-exceptions.d.ts

lib/plugin/merge-affix-symbol.d.ts

lib/plugin/merge-final-word-symbol.d.ts

lib/plugin/merge-initial-digit-sentences.d.ts

lib/plugin/merge-initial-lower-case-letter-sentences.d.ts

lib/plugin/merge-initial-word-symbol.d.ts

lib/plugin/merge-initialisms.d.ts

lib/plugin/merge-inner-word-slash.d.ts

lib/plugin/merge-inner-word-symbol.d.ts

lib/plugin/merge-non-word-sentences.d.ts

lib/plugin/merge-prefix-exceptions.d.ts

lib/plugin/merge-remaining-full-stops.d.ts

lib/plugin/patch-position.d.ts

lib/plugin/remove-empty-nodes.d.ts

522

lib/index.js

		@@ -0,1 +1,12 @@
		/**
		* @typedef {import('vfile').VFile} VFile
		* @typedef {import('nlcst').Parent} Parent
		* @typedef {import('nlcst').Content} Content
		* @typedef {import('nlcst').SentenceContent} SentenceContent
		* @typedef {import('nlcst').Root} Root
		* @typedef {import('nlcst').Paragraph} Paragraph
		* @typedef {import('nlcst').Sentence} Sentence
		*/

		import {toString} from 'nlcst-to-string'
		import {mergeInitialWordSymbol} from './plugin/merge-initial-word-symbol.js'
		@@ -6,3 +17,2 @@ import {mergeFinalWordSymbol} from './plugin/merge-final-word-symbol.js'
		import {mergeInitialisms} from './plugin/merge-initialisms.js'
		import {mergeWords} from './plugin/merge-words.js'
		import {patchPosition} from './plugin/patch-position.js'
		@@ -20,3 +30,2 @@ import {mergeNonWordSentences} from './plugin/merge-non-word-sentences.js'
		import {removeEmptyNodes} from './plugin/remove-empty-nodes.js'
		import {parserFactory} from './parser.js'
		import {
		@@ -35,25 +44,28 @@ newLine,
		export class ParseLatin {
		/**
		* @param {string\|null\|undefined} [doc]
		* @param {VFile\|null\|undefined} [file]
		*/
		constructor(doc, file) {
		const value = file \|\| doc

		/** @type {string\|null} */
		this.doc = value ? String(value) : null
		}

		// Run transform plugins for `key` on `nodes`.
		run(key, nodes) {
		const wareKey = key + 'Plugins'
		const plugins = this[wareKey]
		let index = -1

		if (plugins) {
		while (plugins[++index]) {
		plugins[index](nodes)
		}
		}

		return nodes
		/** @type {Array<(node: Root) => void>} */
		this.tokenizeRootPlugins = [...this.tokenizeRootPlugins]
		/** @type {Array<(node: Paragraph) => void>} */
		this.tokenizeParagraphPlugins = [...this.tokenizeParagraphPlugins]
		/** @type {Array<(node: Sentence) => void>} */
		this.tokenizeSentencePlugins = [...this.tokenizeSentencePlugins]
		}

		// Easy access to the document parser. This additionally supports retext-style
		// invocation: where an instance is created for each file, and the file is given
		// on construction.
		/**
		* Easy access to the document parser.
		* This additionally supports `retext`-like call: where an instance is
		* created for each file, and the file is given on construction.
		*
		* @param {string\|undefined\|null} [value]
		* @returns {Root}
		*/
		parse(value) {
		@@ -63,259 +75,164 @@ return this.tokenizeRoot(value \|\| this.doc)

		// Transform a `value` into a list of `NLCSTNode`s.
		tokenize(value) {
		const tokens = []
		/**
		* Parse as a root.
		*
		* @param {string\|undefined\|null} [value]
		* @returns {Root}
		*/
		tokenizeRoot(value) {
		const paragraph = this.tokenizeParagraph(value)
		/** @type {Root} */
		const result = {
		type: 'RootNode',
		children: splitNode(paragraph, 'WhiteSpaceNode', newLine)
		}

		if (value === null \|\| value === undefined) {
		value = ''
		} else if (value instanceof String) {
		value = value.toString()
		let index = -1
		while (this.tokenizeRootPlugins[++index]) {
		this.tokenizeRootPlugins[index](result)
		}

		if (typeof value !== 'string') {
		// Return the given nodes if this is either an empty array, or an array with
		// a node as a first child.
		if ('length' in value && (!value[0] \|\| value[0].type)) {
		return value
		}
		return result
		}

		throw new Error(
		"Illegal invocation: '" +
		value +
		"' is not a valid argument for 'ParseLatin'"
		)
		/**
		* Parse as a paragraph.
		*
		* @param {string\|undefined\|null} [value]
		* @returns {Paragraph}
		*/
		tokenizeParagraph(value) {
		const sentence = this.tokenizeSentence(value)
		/** @type {Paragraph} */
		const result = {
		type: 'ParagraphNode',
		children: splitNode(sentence, 'PunctuationNode', terminalMarker)
		}

		if (!value) {
		return tokens
		let index = -1
		while (this.tokenizeParagraphPlugins[++index]) {
		this.tokenizeParagraphPlugins[index](result)
		}

		// Eat mechanism to use.
		const eater = this.position ? eat : noPositionEat
		return result
		}

		let index = 0
		let offset = 0
		let line = 1
		let column = 1
		let previous = ''
		let queue = ''
		let left
		let right
		let character
		/**
		* Parse as a sentence.
		*
		* @param {string\|undefined\|null} [value]
		* @returns {Sentence}
		*/
		tokenizeSentence(value) {
		const children = this.tokenize(value)
		/** @type {Sentence} */
		const result = {type: 'SentenceNode', children}

		while (index < value.length) {
		character = value.charAt(index)
		let index = -1
		while (this.tokenizeSentencePlugins[++index]) {
		this.tokenizeSentencePlugins[index](result)
		}

		if (whiteSpace.test(character)) {
		right = 'WhiteSpace'
		} else if (punctuation.test(character)) {
		right = 'Punctuation'
		} else if (word.test(character)) {
		right = 'Word'
		} else {
		right = 'Symbol'
		}
		return result
		}

		tick.call(this)
		/**
		* Transform a `value` into a list of `NLCSTNode`s.
		*
		* @param {string\|undefined\|null} [value]
		* @returns {Array<SentenceContent>}
		*/
		tokenize(value) {
		/** @type {Array<SentenceContent>} */
		const children = []

		previous = character
		character = ''
		left = right
		right = null

		index++
		if (!value) {
		return children
		}

		tick.call(this)
		const currentPoint = {line: 1, column: 1, offset: 0}
		let from = 0
		let index = 0
		let start = {...currentPoint}
		/** @type {SentenceContent['type']\|undefined} */
		let previousType
		/** @type {string\|undefined} */
		let previous

		return tokens
		while (index < value.length) {
		const current = value.charAt(index)
		const currentType = whiteSpace.test(current)
		? 'WhiteSpaceNode'
		: punctuation.test(current)
		? 'PunctuationNode'
		: word.test(current)
		? 'WordNode'
		: 'SymbolNode'

		// Check one character.
		function tick() {
		if (
		left === right &&
		(left === 'Word' \|\|
		left === 'WhiteSpace' \|\|
		character === previous \|\|
		surrogates.test(character))
		from < index &&
		previousType &&
		currentType &&
		!(
		previousType === currentType &&
		// Words or white space continue.
		(previousType === 'WordNode' \|\|
		previousType === 'WhiteSpaceNode' \|\|
		// Same character of punctuation or symbol also continues.
		current === previous \|\|
		// Surrogates of punctuation or symbol also continue.
		surrogates.test(current))
		)
		) {
		queue += character
		} else {
		// Flush the previous queue.
		if (queue) {
		this['tokenize' + left](queue, eater)
		}

		queue = character
		children.push(createNode(previousType, value.slice(from, index)))
		from = index
		start = {...currentPoint}
		}
		}

		// Remove `subvalue` from `value`.
		// Expects `subvalue` to be at the start from `value`, and applies no
		// validation.
		function eat(subvalue) {
		const pos = position()

		update(subvalue)

		return apply

		// Add the given arguments, add `position` to the returned node, and return
		// the node.
		function apply(...input) {
		return pos(add(...input))
		if (current === '\r' \|\| (current === '\n' && previous !== '\r')) {
		currentPoint.line++
		currentPoint.column = 1
		} else if (current !== '\n') {
		currentPoint.column++
		}
		}

		// Remove `subvalue` from `value`.
		// Does not patch positional information.
		function noPositionEat() {
		return add
		currentPoint.offset++
		previousType = currentType
		previous = current
		index++
		}

		// Add mechanism.
		function add(node, parent) {
		if (parent) {
		parent.children.push(node)
		} else {
		tokens.push(node)
		}

		return node
		if (previousType && from < index) {
		children.push(createNode(previousType, value.slice(from, index)))
		}

		// Mark position and patch `node.position`.
		function position() {
		const before = now()
		return children

		// Add the position to a node.
		function patch(node) {
		node.position = new Position(before)

		return node
		}

		return patch
		/**
		 * Build the node for one run of characters of a single kind.
		 *
		 * A `'WordNode'` type yields a word wrapping a single `TextNode` that holds
		 * `value`; any other type yields a node carrying `value` directly.
		 * Every produced position reuses the `start` object of the enclosing scope
		 * and gets its own copy of `currentPoint` as `end`.
		 *
		 * @param {SentenceContent['type']} type
		 * @param {string} value
		 * @returns {SentenceContent}
		 */
		function createNode(type, value) {
			// `start` is shared by reference; `end` is a fresh copy on every call.
			const span = () => ({start, end: {...currentPoint}})

			if (type !== 'WordNode') {
				return {type, value, position: span()}
			}

			return {
				type: 'WordNode',
				children: [{type: 'TextNode', value, position: span()}],
				position: span()
			}
		}

		// Advance the enclosing `offset`, `line`, and `column` counters past
		// `subvalue`.
		// Only `\n` counts as a line break here.
		function update(subvalue) {
			const size = subvalue.length
			const finalBreak = subvalue.lastIndexOf('\n')

			offset += size

			if (finalBreak < 0) {
				// No line break: stay on the current line and move right.
				column += size
				return
			}

			// One more line for every `\n` found.
			let at = subvalue.indexOf('\n')

			while (at !== -1) {
				line++
				at = subvalue.indexOf('\n', at + 1)
			}

			// Restart the column count after the last break.
			column = size - finalBreak
		}

		// Positional information for a node: the given `start` point plus, as `end`,
		// whatever `now()` reports at construction time.
		function Position(start) {
			Object.assign(this, {start, end: now()})
		}

		// Snapshot the enclosing `line`, `column`, and `offset` counters as a new
		// point object.
		function now() {
			const point = {line, column, offset}

			return point
		}
		}
		}

		// Default position.
		ParseLatin.prototype.position = true

		// Create text nodes.
		ParseLatin.prototype.tokenizeSymbol = createTextFactory('Symbol')
		ParseLatin.prototype.tokenizeWhiteSpace = createTextFactory('WhiteSpace')
		ParseLatin.prototype.tokenizePunctuation = createTextFactory('Punctuation')
		ParseLatin.prototype.tokenizeSource = createTextFactory('Source')
		ParseLatin.prototype.tokenizeText = createTextFactory('Text')

		// Inject `plugins` to modify the result of the method at `key` on the operated
		// on context.
		ParseLatin.prototype.use = useFactory(function (context, key, plugins) {
		context[key] = context[key].concat(plugins)
		})

		// Inject `plugins` to modify the result of the method at `key` on the operated
		// on context, before any other.
		ParseLatin.prototype.useFirst = useFactory(function (context, key, plugins) {
		context[key] = plugins.concat(context[key])
		})

		// PARENT NODES
		//
		// All these nodes are `pluggable`: they come with a `use` method which accepts
		// a plugin (`function(NLCSTNode)`).
		// Every time one of these methods is called, the plugin is invoked with the
		// node, allowing for easy modification.
		//
		// In fact, the internal transformation from `tokenize` (a list of words, white
		// space, punctuation, and symbols) to `tokenizeRoot` (an NLCST tree), is also
		// implemented through this mechanism.

		// Create a `WordNode` with its children set to a single `TextNode`, its value
		// set to the given `value`.
		pluggable(ParseLatin, 'tokenizeWord', function (value, eat) {
		const add = (eat \|\| noopEat)('')
		const parent = {type: 'WordNode', children: []}

		this.tokenizeText(value, eat, parent)

		return add(parent)
		})

		// Create a `SentenceNode` with its children set to `Node`s, their values set
		// to the tokenized given `value`.
		//
		// Unless plugins add new nodes, the sentence is populated by `WordNode`s,
		// `SymbolNode`s, `PunctuationNode`s, and `WhiteSpaceNode`s.
		pluggable(
		ParseLatin,
		'tokenizeSentence',
		parserFactory({type: 'SentenceNode', tokenizer: 'tokenize'})
		)

		// Create a `ParagraphNode` with its children set to `Node`s, their values set
		// to the tokenized given `value`.
		//
		// Unless plugins add new nodes, the paragraph is populated by `SentenceNode`s
		// and `WhiteSpaceNode`s.
		pluggable(
		ParseLatin,
		'tokenizeParagraph',
		parserFactory({
		type: 'ParagraphNode',
		delimiter: terminalMarker,
		delimiterType: 'PunctuationNode',
		tokenizer: 'tokenizeSentence'
		})
		)

		// Create a `RootNode` with its children set to `Node`s, their values set to the
		// tokenized given `value`.
		pluggable(
		ParseLatin,
		'tokenizeRoot',
		parserFactory({
		type: 'RootNode',
		delimiter: newLine,
		delimiterType: 'WhiteSpaceNode',
		tokenizer: 'tokenizeParagraph'
		})
		)

		// PLUGINS

		ParseLatin.prototype.use('tokenizeSentence', [
		/** List of transforms handling a sentence. */
		ParseLatin.prototype.tokenizeSentencePlugins = [
		mergeInitialWordSymbol,
		@@ -326,7 +243,7 @@ mergeFinalWordSymbol,
		mergeInitialisms,
		mergeWords,
		patchPosition
		])
		]

		ParseLatin.prototype.use('tokenizeParagraph', [
		/** List of transforms handling a paragraph. */
		ParseLatin.prototype.tokenizeParagraphPlugins = [
		mergeNonWordSentences,
		@@ -344,5 +261,6 @@ mergeAffixSymbol,
		patchPosition
		])
		]

		ParseLatin.prototype.use('tokenizeRoot', [
		/** List of transforms handling a root. */
		ParseLatin.prototype.tokenizeRootPlugins = [
		makeInitialWhiteSpaceSiblings,
		@@ -352,80 +270,48 @@ makeFinalWhiteSpaceSiblings,
		patchPosition
		])
		]

		// TEXT NODES
		/**
		* A function that splits one node into several nodes.
		*
		* @template {Parent} TheNode
		* @param {TheNode} node
		* @param {Content['type']} childType
		* @param {RegExp} expression
		* @returns {Array<TheNode>}
		*/
		function splitNode(node, childType, expression) {
		/** @type {Array<TheNode>} */
		const result = []
		let index = -1
		let start = 0

		// Factory to create a `Text`.
		function createTextFactory(type) {
		type += 'Node'
		while (++index < node.children.length) {
		const token = node.children[index]

		return createText
		if (
		index === node.children.length - 1 \|\|
		(token.type === childType && expression.test(toString(token)))
		) {
		/** @type {TheNode} */
		// @ts-expect-error: fine
		const parent = {
		type: node.type,
		children: node.children.slice(start, index + 1)
		}

		// Construct a `Text` from a bound `type`
		function createText(value, eat, parent) {
		if (value === null \|\| value === undefined) {
		value = ''
		}
		const first = node.children[start]
		const last = token
		if (first.position && last.position) {
		parent.position = {
		start: first.position.start,
		end: last.position.end
		}
		}

		return (eat \|\| noopEat)(value)({type, value: String(value)}, parent)
		}
		}

		// Make a method “pluggable”: install `key` on `Constructor.prototype` as a
		// wrapper that calls `callback` with the receiver and arguments it was given,
		// then hands the outcome to `this.run(key, …)` and returns what `run` returns.
		function pluggable(Constructor, key, callback) {
			const proto = Constructor.prototype

			proto[key] = function (...parameters) {
				const outcome = callback.call(this, ...parameters)

				return this.run(key, outcome)
			}
		}

		// Factory to inject `plugins`. Takes `callback` for the actual inserting.
		function useFactory(callback) {
		return use

		// Validate if `plugins` can be inserted.
		// Invokes the bound `callback` to do the actual inserting.
		function use(key, plugins) {
		// Throw if the method is not pluggable.
		if (!(key in this)) {
		throw new Error(
		'Illegal Invocation: Unsupported `key` for ' +
		'`use(key, plugins)`. Make sure `key` is a ' +
		'supported function'
		)
		result.push(parent)
		start = index + 1
		}

		// Fail silently when no plugins are given.
		if (!plugins) {
		return
		}

		const wareKey = key + 'Plugins'

		// Make sure `plugins` is a list.
		plugins = typeof plugins === 'function' ? [plugins] : plugins.concat()

		// Make sure `wareKey` exists.
		if (!this[wareKey]) {
		this[wareKey] = []
		}

		// Invoke callback with the ware key and plugins.
		callback(this, wareKey, plugins)
		}
		}

		// Add mechanism used when text-tokenisers are called directly outside of the
		// `tokenize` function.
		function noopAdd(node, parent) {
		if (parent) {
		parent.children.push(node)
		}

		return node
		return result
		}

		// Eat and add mechanism without adding positional information, used when
		// text-tokenisers are called directly outside of the `tokenize` function.
		function noopEat() {
		return noopAdd
		}

76

lib/plugin/break-implicit-sentences.js

		@@ -0,1 +1,6 @@
		/**
		* @typedef {import('nlcst').Paragraph} Paragraph
		* @typedef {import('nlcst').Sentence} Sentence
		*/

		import {toString} from 'nlcst-to-string'
		@@ -5,48 +10,51 @@ import {modifyChildren} from 'unist-util-modify-children'
		// Break a sentence if a white space with more than one new-line is found.
		export const breakImplicitSentences = modifyChildren(function (
		child,
		index,
		parent
		) {
		if (child.type !== 'SentenceNode') {
		return
		}
		export const breakImplicitSentences = modifyChildren(
		/**
		* @type {import('unist-util-modify-children').Modifier<Paragraph>}
		*/

		const children = child.children
		function (child, index, parent) {
		if (child.type !== 'SentenceNode') {
		return
		}

		// Ignore first and last child.
		let position = 0
		const children = child.children

		while (++position < children.length - 1) {
		const node = children[position]
		// Ignore first and last child.
		let position = 0

		if (
		node.type !== 'WhiteSpaceNode' \|\|
		toString(node).split(/\r\n\|\r\|\n/).length < 3
		) {
		continue
		}
		while (++position < children.length - 1) {
		const node = children[position]

		child.children = children.slice(0, position)
		if (
		node.type !== 'WhiteSpaceNode' \|\|
		toString(node).split(/\r\n\|\r\|\n/).length < 3
		) {
		continue
		}

		const insertion = {
		type: 'SentenceNode',
		children: children.slice(position + 1)
		}
		child.children = children.slice(0, position)

		const tail = children[position - 1]
		const head = children[position + 1]
		/** @type {Sentence} */
		const insertion = {
		type: 'SentenceNode',
		children: children.slice(position + 1)
		}

		parent.children.splice(index + 1, 0, node, insertion)
		const tail = children[position - 1]
		const head = children[position + 1]

		if (child.position && tail.position && head.position) {
		const end = child.position.end
		parent.children.splice(index + 1, 0, node, insertion)

		child.position.end = tail.position.end
		if (child.position && tail.position && head.position) {
		const end = child.position.end

		insertion.position = {start: head.position.start, end}
		child.position.end = tail.position.end

		insertion.position = {start: head.position.start, end}
		}

		return index + 1
		}

		return index + 1
		}
		})
		)

44

lib/plugin/make-final-white-space-siblings.js

		@@ -0,1 +1,6 @@
		/**
		* @typedef {import('nlcst').Root} Root
		* @typedef {import('nlcst').Paragraph} Paragraph
		*/

		import {modifyChildren} from 'unist-util-modify-children'
		@@ -5,24 +10,25 @@
		// paragraphs.
		export const makeFinalWhiteSpaceSiblings = modifyChildren(function (
		child,
		index,
		parent
		) {
		const children = child.children
		export const makeFinalWhiteSpaceSiblings = modifyChildren(
		/**
		* @type {import('unist-util-modify-children').Modifier<Root \| Paragraph>}
		*/

		if (
		children &&
		children.length > 0 &&
		children[children.length - 1].type === 'WhiteSpaceNode'
		) {
		parent.children.splice(index + 1, 0, child.children.pop())
		const previous = children[children.length - 1]
		function (child, index, parent) {
		if ('children' in child) {
		const tail = child.children[child.children.length - 1]

		if (previous && previous.position && child.position) {
		child.position.end = previous.position.end
		if (tail && tail.type === 'WhiteSpaceNode') {
		child.children.pop() // Remove `tail`.
		parent.children.splice(index + 1, 0, tail)
		const previous = child.children[child.children.length - 1]

		if (previous && previous.position && child.position) {
		child.position.end = previous.position.end
		}

		// Next, iterate over the current node again.
		return index
		}
		}

		// Next, iterate over the current node again.
		return index
		}
		})
		)

37

lib/plugin/make-initial-white-space-siblings.js

		@@ -0,1 +1,6 @@
		/**
		* @typedef {import('nlcst').Root} Root
		* @typedef {import('nlcst').Paragraph} Paragraph
		*/

		import {visitChildren} from 'unist-util-visit-children'
		@@ -5,20 +10,20 @@
		// sentences.
		export const makeInitialWhiteSpaceSiblings = visitChildren(function (
		child,
		index,
		parent
		) {
		const children = child.children
		if (
		children &&
		children.length > 0 &&
		children[0].type === 'WhiteSpaceNode'
		) {
		parent.children.splice(index, 0, children.shift())
		const next = children[0]
		export const makeInitialWhiteSpaceSiblings = visitChildren(
		/**
		* @type {import('unist-util-visit-children').Visitor<Paragraph\|Root>}
		*/
		function (child, index, parent) {
		if ('children' in child && child.children) {
		const head = child.children[0]
		if (head && head.type === 'WhiteSpaceNode') {
		child.children.shift()
		parent.children.splice(index, 0, head)
		const next = child.children[0]

		if (next && next.position && child.position) {
		child.position.start = next.position.start
		if (next && next.position && child.position) {
		child.position.start = next.position.start
		}
		}
		}
		}
		})
		)

69

lib/plugin/merge-affix-exceptions.js

		@@ -0,1 +1,5 @@
		/**
		* @typedef {import('nlcst').Paragraph} Paragraph
		*/

		import {toString} from 'nlcst-to-string'
		@@ -6,43 +10,46 @@ import {modifyChildren} from 'unist-util-modify-children'
		// comma.
		export const mergeAffixExceptions = modifyChildren(function (
		child,
		index,
		parent
		) {
		const children = child.children
		export const mergeAffixExceptions = modifyChildren(
		/**
		* @type {import('unist-util-modify-children').Modifier<Paragraph>}
		*/
		function (child, index, parent) {
		const previous = parent.children[index - 1]

		if (!children \|\| children.length === 0 \|\| index < 1) {
		return
		}
		if (
		previous &&
		'children' in previous &&
		'children' in child &&
		child.children.length > 0
		) {
		let position = -1

		let position = -1
		while (child.children[++position]) {
		const node = child.children[position]

		while (children[++position]) {
		const node = children[position]
		if (node.type === 'WordNode') {
		return
		}

		if (node.type === 'WordNode') {
		return
		}
		if (node.type === 'SymbolNode' \|\| node.type === 'PunctuationNode') {
		const value = toString(node)

		if (node.type === 'SymbolNode' \|\| node.type === 'PunctuationNode') {
		const value = toString(node)
		if (value !== ',' && value !== ';') {
		return
		}

		if (value !== ',' && value !== ';') {
		return
		}
		previous.children.push(...child.children)

		const previousChild = parent.children[index - 1]
		previousChild.children = previousChild.children.concat(children)
		// Update position.
		if (previous.position && child.position) {
		previous.position.end = child.position.end
		}

		// Update position.
		if (previousChild.position && child.position) {
		previousChild.position.end = child.position.end
		parent.children.splice(index, 1)

		// Next, iterate over the node now at the current position.
		return index
		}
		}

		parent.children.splice(index, 1)

		// Next, iterate over the node now at the current position.
		return index
		}
		}
		})
		)

52

lib/plugin/merge-affix-symbol.js

		@@ -0,1 +1,5 @@
		/**
		* @typedef {import('nlcst').Paragraph} Paragraph
		*/

		import {toString} from 'nlcst-to-string'
		@@ -11,29 +15,35 @@ import {modifyChildren} from 'unist-util-modify-children'
		// sentence) to the previous sentence.
		export const mergeAffixSymbol = modifyChildren(function (child, index, parent) {
		const children = child.children
		export const mergeAffixSymbol = modifyChildren(
		/**
		* @type {import('unist-util-modify-children').Modifier<Paragraph>}
		*/
		function (child, index, parent) {
		if ('children' in child && child.children.length > 0 && index > 0) {
		const previous = parent.children[index - 1]
		const first = child.children[0]
		const second = child.children[1]

		if (children && children.length > 0 && index > 0) {
		const first = children[0]
		const second = children[1]
		const previous = parent.children[index - 1]
		if (
		previous &&
		previous.type === 'SentenceNode' &&
		(first.type === 'SymbolNode' \|\| first.type === 'PunctuationNode') &&
		affixSymbol.test(toString(first))
		) {
		child.children.shift() // Remove `first`.
		previous.children.push(first)

		if (
		(first.type === 'SymbolNode' \|\| first.type === 'PunctuationNode') &&
		affixSymbol.test(toString(first))
		) {
		previous.children.push(children.shift())
		// Update position.
		if (first.position && previous.position) {
		previous.position.end = first.position.end
		}

		// Update position.
		if (first.position && previous.position) {
		previous.position.end = first.position.end
		}
		if (second && second.position && child.position) {
		child.position.start = second.position.start
		}

		if (second && second.position && child.position) {
		child.position.start = second.position.start
		// Next, iterate over the previous node again.
		return index - 1
		}

		// Next, iterate over the previous node again.
		return index - 1
		}
		}
		})
		)

63

lib/plugin/merge-final-word-symbol.js

		@@ -0,1 +1,5 @@
		/**
		* @typedef {import('nlcst').Sentence} Sentence
		*/

		import {toString} from 'nlcst-to-string'
		@@ -5,37 +9,38 @@ import {modifyChildren} from 'unist-util-modify-children'
		// Merge certain punctuation marks into their preceding words.
		export const mergeFinalWordSymbol = modifyChildren(function (
		child,
		index,
		parent
		) {
		if (
		index > 0 &&
		(child.type === 'SymbolNode' \|\| child.type === 'PunctuationNode') &&
		toString(child) === '-'
		) {
		const children = parent.children
		const previous = children[index - 1]
		const next = children[index + 1]

		export const mergeFinalWordSymbol = modifyChildren(
		/**
		* @type {import('unist-util-modify-children').Modifier<Sentence>}
		*/
		function (child, index, parent) {
		if (
		(!next \|\| next.type !== 'WordNode') &&
		previous &&
		previous.type === 'WordNode'
		index > 0 &&
		(child.type === 'SymbolNode' \|\| child.type === 'PunctuationNode') &&
		toString(child) === '-'
		) {
		// Remove `child` from parent.
		children.splice(index, 1)
		const children = parent.children
		const previous = children[index - 1]
		const next = children[index + 1]

		// Add the punctuation mark at the end of the previous node.
		previous.children.push(child)
		if (
		(!next \|\| next.type !== 'WordNode') &&
		previous &&
		previous.type === 'WordNode'
		) {
		// Remove `child` from parent.
		children.splice(index, 1)

		// Update position.
		if (previous.position && child.position) {
		previous.position.end = child.position.end
		// Add the punctuation mark at the end of the previous node.
		previous.children.push(child)

		// Update position.
		if (previous.position && child.position) {
		previous.position.end = child.position.end
		}

		// Next, iterate over the node now at the current position (which was
		// the next node).
		return index
		}

		// Next, iterate over the node now at the current position (which was
		// the next node).
		return index
		}
		}
		})
		)

53

lib/plugin/merge-initial-digit-sentences.js

		@@ -0,1 +1,5 @@
		/**
		* @typedef {import('nlcst').Paragraph} Paragraph
		*/

		import {toString} from 'nlcst-to-string'
		@@ -7,29 +11,30 @@ import {modifyChildren} from 'unist-util-modify-children'
		// lower case letter.
		export const mergeInitialDigitSentences = modifyChildren(function (
		child,
		index,
		parent
		) {
		const children = child.children
		const siblings = parent.children
		const previous = siblings[index - 1]
		const head = children[0]
		export const mergeInitialDigitSentences = modifyChildren(
		/**
		* @type {import('unist-util-modify-children').Modifier<Paragraph>}
		*/
		function (child, index, parent) {
		const previous = parent.children[index - 1]

		if (
		previous &&
		head &&
		head.type === 'WordNode' &&
		digitStart.test(toString(head))
		) {
		previous.children = previous.children.concat(children)
		siblings.splice(index, 1)
		if (
		previous &&
		previous.type === 'SentenceNode' &&
		child.type === 'SentenceNode'
		) {
		const head = child.children[0]

		// Update position.
		if (previous.position && child.position) {
		previous.position.end = child.position.end
		if (head && head.type === 'WordNode' && digitStart.test(toString(head))) {
		previous.children.push(...child.children)
		parent.children.splice(index, 1)

		// Update position.
		if (previous.position && child.position) {
		previous.position.end = child.position.end
		}

		// Next, iterate over the node now at the current position.
		return index
		}
		}

		// Next, iterate over the node now at the current position.
		return index
		}
		})
		)

63

lib/plugin/merge-initial-lower-case-letter-sentences.js

		@@ -0,1 +1,5 @@
		/**
		* @typedef {import('nlcst').Paragraph} Paragraph
		*/

		import {toString} from 'nlcst-to-string'
		@@ -9,41 +13,42 @@ import {modifyChildren} from 'unist-util-modify-children'
		// lower case letter.
		export const mergeInitialLowerCaseLetterSentences = modifyChildren(function (
		child,
		index,
		parent
		) {
		const children = child.children
		export const mergeInitialLowerCaseLetterSentences = modifyChildren(
		/**
		* @type {import('unist-util-modify-children').Modifier<Paragraph>}
		*/
		function (child, index, parent) {
		if (child.type === 'SentenceNode' && index > 0) {
		const previous = parent.children[index - 1]
		const children = child.children

		if (children && children.length > 0 && index > 0) {
		let position = -1
		if (children.length > 0 && previous.type === 'SentenceNode') {
		let position = -1

		while (children[++position]) {
		const node = children[position]
		while (children[++position]) {
		const node = children[position]

		if (node.type === 'WordNode') {
		if (!lowerInitial.test(toString(node))) {
		return
		}
		if (node.type === 'WordNode') {
		if (!lowerInitial.test(toString(node))) {
		return
		}

		const siblings = parent.children
		const previous = siblings[index - 1]
		previous.children.push(...children)

		previous.children = previous.children.concat(children)
		parent.children.splice(index, 1)

		siblings.splice(index, 1)
		// Update position.
		if (previous.position && child.position) {
		previous.position.end = child.position.end
		}

		// Update position.
		if (previous.position && child.position) {
		previous.position.end = child.position.end
		// Next, iterate over the node now at the current position.
		return index
		}

		if (node.type === 'SymbolNode' \|\| node.type === 'PunctuationNode') {
		return
		}
		}

		// Next, iterate over the node now at the current position.
		return index
		}

		if (node.type === 'SymbolNode' \|\| node.type === 'PunctuationNode') {
		return
		}
		}
		}
		})
		)

69

lib/plugin/merge-initial-word-symbol.js

		@@ -0,1 +1,5 @@
		/**
		* @typedef {import('nlcst').Sentence} Sentence
		*/

		import {toString} from 'nlcst-to-string'
		@@ -5,39 +9,40 @@ import {modifyChildren} from 'unist-util-modify-children'
		// Merge certain punctuation marks into their following words.
		export const mergeInitialWordSymbol = modifyChildren(function (
		child,
		index,
		parent
		) {
		if (
		(child.type !== 'SymbolNode' && child.type !== 'PunctuationNode') \|\|
		toString(child) !== '&'
		) {
		return
		}
		export const mergeInitialWordSymbol = modifyChildren(
		/**
		* @type {import('unist-util-modify-children').Modifier<Sentence>}
		*/
		function (child, index, parent) {
		if (
		(child.type !== 'SymbolNode' && child.type !== 'PunctuationNode') \|\|
		toString(child) !== '&'
		) {
		return
		}

		const children = parent.children
		const next = children[index + 1]
		const children = parent.children
		const next = children[index + 1]

		// If either a previous word, or no following word, exists, exit early.
		if (
		(index > 0 && children[index - 1].type === 'WordNode') \|\|
		!(next && next.type === 'WordNode')
		) {
		return
		}
		// If either a previous word, or no following word, exists, exit early.
		if (
		(index > 0 && children[index - 1].type === 'WordNode') \|\|
		!(next && next.type === 'WordNode')
		) {
		return
		}

		// Remove `child` from parent.
		children.splice(index, 1)
		// Remove `child` from parent.
		children.splice(index, 1)

		// Add the punctuation mark at the start of the next node.
		next.children.unshift(child)
		// Add the punctuation mark at the start of the next node.
		next.children.unshift(child)

		// Update position.
		if (next.position && child.position) {
		next.position.start = child.position.start
		// Update position.
		if (next.position && child.position) {
		next.position.start = child.position.start
		}

		// Next, iterate over the node at the previous position, as it's now adjacent
		// to a following word.
		return index - 1
		}

		// Next, iterate over the node at the previous position, as it's now adjacent
		// to a following word.
		return index - 1
		})
		)

94

lib/plugin/merge-initialisms.js

		@@ -0,1 +1,5 @@
		/**
		* @typedef {import('nlcst').Sentence} Sentence
		*/

		import {toString} from 'nlcst-to-string'
		@@ -6,58 +10,64 @@ import {modifyChildren} from 'unist-util-modify-children'
		// Merge initialisms.
		export const mergeInitialisms = modifyChildren(function (child, index, parent) {
		if (index > 0 && toString(child) === '.') {
		const siblings = parent.children

		const previous = siblings[index - 1]
		const children = previous.children

		export const mergeInitialisms = modifyChildren(
		/**
		* @type {import('unist-util-modify-children').Modifier<Sentence>}
		*/
		function (child, index, parent) {
		if (
		previous.type === 'WordNode' &&
		children &&
		children.length !== 1 &&
		children.length % 2 !== 0
		index > 0 &&
		child.type === 'PunctuationNode' &&
		toString(child) === '.'
		) {
		let position = children.length
		let isAllDigits = true
		const previous = parent.children[index - 1]

		while (children[--position]) {
		const otherChild = children[position]
		if (
		previous.type === 'WordNode' &&
		previous.children &&
		previous.children.length !== 1 &&
		previous.children.length % 2 !== 0
		) {
		let position = previous.children.length
		let isAllDigits = true

		const value = toString(otherChild)
		while (previous.children[--position]) {
		const otherChild = previous.children[position]

		if (position % 2 === 0) {
		// Initialisms consist of one character values.
		if (value.length > 1) {
		return
		}
		const value = toString(otherChild)

		if (!numerical.test(value)) {
		isAllDigits = false
		if (position % 2 === 0) {
		// Initialisms consist of one character values.
		if (value.length > 1) {
		return
		}

		if (!numerical.test(value)) {
		isAllDigits = false
		}
		} else if (value !== '.') {
		if (position < previous.children.length - 2) {
		break
		} else {
		return
		}
		}
		} else if (value !== '.') {
		if (position < children.length - 2) {
		break
		} else {
		return
		}
		}
		}

		if (!isAllDigits) {
		// Remove `child` from parent.
		siblings.splice(index, 1)
		if (!isAllDigits) {
		// Remove `child` from parent.
		parent.children.splice(index, 1)

		// Add child to the previous children.
		children.push(child)
		// Add child to the previous children.
		previous.children.push(child)

		// Update position.
		if (previous.position && child.position) {
		previous.position.end = child.position.end
		// Update position.
		if (previous.position && child.position) {
		previous.position.end = child.position.end
		}

		// Next, iterate over the node now at the current position.
		return index
		}

		// Next, iterate over the node now at the current position.
		return index
		}
		}
		}
		})
		)

83

lib/plugin/merge-inner-word-slash.js

		@@ -0,50 +1,57 @@
		/**
		* @typedef {import('nlcst').SentenceContent} SentenceContent
		* @typedef {import('nlcst').WordContent} WordContent
		* @typedef {import('nlcst').Sentence} Sentence
		*/

		import {toString} from 'nlcst-to-string'
		import {modifyChildren} from 'unist-util-modify-children'

		const slash = '/'

		// Merge words joined by certain punctuation marks.
		export const mergeInnerWordSlash = modifyChildren(function (
		child,
		index,
		parent
		) {
		const siblings = parent.children
		const previous = siblings[index - 1]
		const next = siblings[index + 1]
		export const mergeInnerWordSlash = modifyChildren(
		/**
		* @type {import('unist-util-modify-children').Modifier<Sentence>}
		*/
		function (child, index, parent) {
		const siblings = parent.children
		const previous = siblings[index - 1]

		if (
		previous &&
		previous.type === 'WordNode' &&
		(child.type === 'SymbolNode' \|\| child.type === 'PunctuationNode') &&
		toString(child) === slash
		) {
		const previousValue = toString(previous)
		let tail = child
		let queue = [child]
		let count = 1
		let nextValue = ''
		if (
		previous &&
		previous.type === 'WordNode' &&
		(child.type === 'SymbolNode' \|\| child.type === 'PunctuationNode') &&
		toString(child) === '/'
		) {
		const previousValue = toString(previous)
		/** @type {SentenceContent} */
		let tail = child
		/** @type {Array<WordContent>} */
		const queue = [child]
		let count = 1
		let nextValue = ''
		const next = siblings[index + 1]

		if (next && next.type === 'WordNode') {
		nextValue = toString(next)
		tail = next
		queue = queue.concat(next.children)
		count++
		}
		if (next && next.type === 'WordNode') {
		nextValue = toString(next)
		tail = next
		queue.push(...next.children)
		count++
		}

		if (previousValue.length < 3 && (!nextValue \|\| nextValue.length < 3)) {
		// Add all found tokens to `prev`s children.
		previous.children = previous.children.concat(queue)
		if (previousValue.length < 3 && (!nextValue \|\| nextValue.length < 3)) {
		// Add all found tokens to `prev`s children.
		previous.children.push(...queue)

		siblings.splice(index, count)
		siblings.splice(index, count)

		// Update position.
		if (previous.position && tail.position) {
		previous.position.end = tail.position.end
		// Update position.
		if (previous.position && tail.position) {
		previous.position.end = tail.position.end
		}

		// Next, iterate over the node now at the current position.
		return index
		}

		// Next, iterate over the node now at the current position.
		return index
		}
		}
		})
		)

110

lib/plugin/merge-inner-word-symbol.js

		@@ -0,1 +1,6 @@
		/**
		* @typedef {import('nlcst').Sentence} Sentence
		* @typedef {import('nlcst').WordContent} WordContent
		*/

		import {toString} from 'nlcst-to-string'
		@@ -8,66 +13,69 @@ import {modifyChildren} from 'unist-util-modify-children'
		// Merge words joined by certain punctuation marks.
		export const mergeInnerWordSymbol = modifyChildren(function (
		child,
		index,
		parent
		) {
		if (
		index > 0 &&
		(child.type === 'SymbolNode' \|\| child.type === 'PunctuationNode')
		) {
		const siblings = parent.children
		const previous = siblings[index - 1]
		export const mergeInnerWordSymbol = modifyChildren(
		/**
		* @type {import('unist-util-modify-children').Modifier<Sentence>}
		*/
		function (child, index, parent) {
		if (
		index > 0 &&
		(child.type === 'SymbolNode' \|\| child.type === 'PunctuationNode')
		) {
		const siblings = parent.children
		const previous = siblings[index - 1]

		if (previous && previous.type === 'WordNode') {
		let position = index - 1
		let tokens = []
		let queue = []
		if (previous && previous.type === 'WordNode') {
		let position = index - 1
		/** @type {Array<WordContent>} */
		const tokens = []
		/** @type {Array<WordContent>} */
		let queue = []

		// - If a token which is neither word nor inner word symbol is found,
		// the loop is broken
		// - If an inner word symbol is found, it’s queued
		// - If a word is found, it’s queued (and the queue stored and emptied)
		while (siblings[++position]) {
		const sibling = siblings[position]
		// - If a token which is neither word nor inner word symbol is found,
		// the loop is broken
		// - If an inner word symbol is found, it’s queued
		// - If a word is found, it’s queued (and the queue stored and emptied)
		while (siblings[++position]) {
		const sibling = siblings[position]

		if (sibling.type === 'WordNode') {
		tokens = tokens.concat(queue, sibling.children)
		if (sibling.type === 'WordNode') {
		tokens.push(...queue, ...sibling.children)

		queue = []
		} else if (
		(sibling.type === 'SymbolNode' \|\|
		sibling.type === 'PunctuationNode') &&
		wordSymbolInner.test(toString(sibling))
		) {
		queue.push(sibling)
		} else {
		break
		queue = []
		} else if (
		(sibling.type === 'SymbolNode' \|\|
		sibling.type === 'PunctuationNode') &&
		wordSymbolInner.test(toString(sibling))
		) {
		queue.push(sibling)
		} else {
		break
		}
		}
		}

		if (tokens.length > 0) {
		// If there is a queue, remove its length from `position`.
		if (queue.length > 0) {
		position -= queue.length
		}
		if (tokens.length > 0) {
		// If there is a queue, remove its length from `position`.
		if (queue.length > 0) {
		position -= queue.length
		}

		// Remove every (one or more) inner-word punctuation marks and children
		// of words.
		siblings.splice(index, position - index)
		// Remove every (one or more) inner-word punctuation marks and children
		// of words.
		siblings.splice(index, position - index)

		// Add all found tokens to `prev`s children.
		previous.children = previous.children.concat(tokens)
		// Add all found tokens to `prev`s children.
		previous.children.push(...tokens)

		const last = tokens[tokens.length - 1]
		const last = tokens[tokens.length - 1]

		// Update position.
		if (previous.position && last.position) {
		previous.position.end = last.position.end
		// Update position.
		if (previous.position && last.position) {
		previous.position.end = last.position.end
		}

		// Next, iterate over the node now at the current position.
		return index
		}

		// Next, iterate over the node now at the current position.
		return index
		}
		}
		}
		})
		)

76

lib/plugin/merge-non-word-sentences.js

		@@ -0,1 +1,5 @@
		/**
		* @typedef {import('nlcst').Paragraph} Paragraph
		*/

		import {modifyChildren} from 'unist-util-modify-children'
		@@ -5,47 +9,49 @@
		// contain word tokens.
		export const mergeNonWordSentences = modifyChildren(function (
		child,
		index,
		parent
		) {
		const children = child.children
		let position = -1
		export const mergeNonWordSentences = modifyChildren(
		/**
		* @type {import('unist-util-modify-children').Modifier<Paragraph>}
		*/
		function (child, index, parent) {
		if ('children' in child) {
		let position = -1

		while (children[++position]) {
		if (children[position].type === 'WordNode') {
		return
		}
		}
		while (child.children[++position]) {
		if (child.children[position].type === 'WordNode') {
		return
		}
		}

		const previous = parent.children[index - 1]
		const previous = parent.children[index - 1]

		if (previous) {
		previous.children = previous.children.concat(children)
		if (previous && 'children' in previous) {
		previous.children.push(...child.children)

		// Remove the child.
		parent.children.splice(index, 1)
		// Remove the child.
		parent.children.splice(index, 1)

		// Patch position.
		if (previous.position && child.position) {
		previous.position.end = child.position.end
		}
		// Patch position.
		if (previous.position && child.position) {
		previous.position.end = child.position.end
		}

		// Next, iterate over the node now at the current position (which was the
		// next node).
		return index
		}
		// Next, iterate over the node now at the current position (which was the
		// next node).
		return index
		}

		const next = parent.children[index + 1]
		const next = parent.children[index + 1]

		if (next) {
		next.children = children.concat(next.children)
		if (next && 'children' in next) {
		next.children.unshift(...child.children)

		// Patch position.
		if (next.position && child.position) {
		next.position.start = child.position.start
		// Patch position.
		if (next.position && child.position) {
		next.position.start = child.position.start
		}

		// Remove the child.
		parent.children.splice(index, 1)
		}
		}

		// Remove the child.
		parent.children.splice(index, 1)
		}
		})
		)

74

lib/plugin/merge-prefix-exceptions.js

		@@ -0,1 +1,5 @@
		/**
		* @typedef {import('nlcst').Paragraph} Paragraph
		*/

		import {toString} from 'nlcst-to-string'
		@@ -26,44 +30,46 @@ import {modifyChildren} from 'unist-util-modify-children'
		// certain word.
		export const mergePrefixExceptions = modifyChildren(function (
		child,
		index,
		parent
		) {
		const children = child.children
		export const mergePrefixExceptions = modifyChildren(
		/**
		* @type {import('unist-util-modify-children').Modifier<Paragraph>}
		*/
		function (child, index, parent) {
		if ('children' in child && child.children.length > 1) {
		const period = child.children[child.children.length - 1]

		if (children && children.length > 1) {
		const period = children[children.length - 1]

		if (period && toString(period) === '.') {
		const node = children[children.length - 2]

		if (
		node &&
		node.type === 'WordNode' &&
		abbreviationPrefix.test(toString(node).toLowerCase())
		period &&
		(period.type === 'PunctuationNode' \|\| period.type === 'SymbolNode') &&
		toString(period) === '.'
		) {
		// Merge period into abbreviation.
		node.children.push(period)
		children.pop()
		const node = child.children[child.children.length - 2]

		// Update position.
		if (period.position && node.position) {
		node.position.end = period.position.end
		}
		if (
		node &&
		node.type === 'WordNode' &&
		abbreviationPrefix.test(toString(node).toLowerCase())
		) {
		// Merge period into abbreviation.
		node.children.push(period)
		child.children.pop()

		// Merge sentences.
		const next = parent.children[index + 1]
		// Update position.
		if (period.position && node.position) {
		node.position.end = period.position.end
		}

		if (next) {
		child.children = children.concat(next.children)
		// Merge sentences.
		const next = parent.children[index + 1]

		parent.children.splice(index + 1, 1)
		if (next && next.type === 'SentenceNode') {
		child.children.push(...next.children)
		parent.children.splice(index + 1, 1)

		// Update position.
		if (next.position && child.position) {
		child.position.end = next.position.end
		// Update position.
		if (next.position && child.position) {
		child.position.end = next.position.end
		}

		// Next, iterate over the current node again.
		return index - 1
		}

		// Next, iterate over the current node again.
		return index - 1
		}
		@@ -73,2 +79,2 @@ }
		}
		})
		)

136

lib/plugin/merge-remaining-full-stops.js

		@@ -0,1 +1,5 @@
		/**
		* @typedef {import('nlcst').Paragraph} Paragraph
		*/

		import {toString} from 'nlcst-to-string'
		@@ -10,82 +14,88 @@ import {visitChildren} from 'unist-util-visit-children'
		// or the next word (if available).
		export const mergeRemainingFullStops = visitChildren(function (child) {
		const children = child.children
		let position = children.length
		let hasFoundDelimiter = false
		export const mergeRemainingFullStops = visitChildren(
		/**
		* @type {import('unist-util-visit-children').Visitor<Paragraph>}
		*/
		// eslint-disable-next-line complexity
		function (child, _, _parent) {
		if ('children' in child) {
		let position = child.children.length
		let hasFoundDelimiter = false

		while (children[--position]) {
		const grandchild = children[position]
		while (child.children[--position]) {
		const grandchild = child.children[position]

		if (
		grandchild.type !== 'SymbolNode' &&
		grandchild.type !== 'PunctuationNode'
		) {
		// This is a sentence without terminal marker, so we 'fool' the code to
		// make it think we have found one.
		if (grandchild.type === 'WordNode') {
		hasFoundDelimiter = true
		}
		if (
		grandchild.type !== 'SymbolNode' &&
		grandchild.type !== 'PunctuationNode'
		) {
		// This is a sentence without terminal marker, so we 'fool' the code to
		// make it think we have found one.
		if (grandchild.type === 'WordNode') {
		hasFoundDelimiter = true
		}

		continue
		}
		continue
		}

		// Exit when this token is not a terminal marker.
		if (!terminalMarker.test(toString(grandchild))) {
		continue
		}
		// Exit when this token is not a terminal marker.
		if (!terminalMarker.test(toString(grandchild))) {
		continue
		}

		// Ignore the first terminal marker found (starting at the end), as it
		// should not be merged.
		if (!hasFoundDelimiter) {
		hasFoundDelimiter = true
		// Ignore the first terminal marker found (starting at the end), as it
		// should not be merged.
		if (!hasFoundDelimiter) {
		hasFoundDelimiter = true
		continue
		}

		continue
		}
		// Only merge a single full stop.
		if (toString(grandchild) !== '.') {
		continue
		}

		// Only merge a single full stop.
		if (toString(grandchild) !== '.') {
		continue
		}
		const previous = child.children[position - 1]
		const next = child.children[position + 1]

		const previous = children[position - 1]
		const next = children[position + 1]
		if (previous && previous.type === 'WordNode') {
		const nextNext = child.children[position + 2]

		if (previous && previous.type === 'WordNode') {
		const nextNext = children[position + 2]
		// Continue when the full stop is followed by a space and another full
		// stop, such as: `{.} .`
		if (
		next &&
		nextNext &&
		next.type === 'WhiteSpaceNode' &&
		toString(nextNext) === '.'
		) {
		continue
		}

		// Continue when the full stop is followed by a space and another full
		// stop, such as: `{.} .`
		if (
		next &&
		nextNext &&
		next.type === 'WhiteSpaceNode' &&
		toString(nextNext) === '.'
		) {
		continue
		}
		// Remove `child` from parent.
		child.children.splice(position, 1)

		// Remove `child` from parent.
		children.splice(position, 1)
		// Add the punctuation mark at the end of the previous node.
		previous.children.push(grandchild)

		// Add the punctuation mark at the end of the previous node.
		previous.children.push(grandchild)
		// Update position.
		if (grandchild.position && previous.position) {
		previous.position.end = grandchild.position.end
		}

		// Update position.
		if (grandchild.position && previous.position) {
		previous.position.end = grandchild.position.end
		}
		position--
		} else if (next && next.type === 'WordNode') {
		// Remove `child` from parent.
		child.children.splice(position, 1)

		position--
		} else if (next && next.type === 'WordNode') {
		// Remove `child` from parent.
		children.splice(position, 1)
		// Add the punctuation mark at the start of the next node.
		next.children.unshift(grandchild)

		// Add the punctuation mark at the start of the next node.
		next.children.unshift(grandchild)

		if (grandchild.position && next.position) {
		next.position.start = grandchild.position.start
		if (grandchild.position && next.position) {
		next.position.start = grandchild.position.start
		}
		}
		}
		}
		}
		})
		)

54

lib/plugin/patch-position.js

		@@ -0,31 +1,47 @@
		/**
		* @typedef {import('unist').Node} Node
		* @typedef {import('nlcst').Sentence} Sentence
		* @typedef {import('nlcst').Paragraph} Paragraph
		* @typedef {import('nlcst').Root} Root
		*/

		import {visitChildren} from 'unist-util-visit-children'

		// Patch the position on a parent node based on its first and last child.
		export const patchPosition = visitChildren(function (child, index, node) {
		const siblings = node.children
		export const patchPosition = visitChildren(
		/**
		* @type {import('unist-util-visit-children').Visitor<Paragraph\|Sentence\|Root>}
		*/
		function (child, index, node) {
		const siblings = node.children

		if (!child.position) {
		return
		}
		if (
		index < 1 &&
		/* c8 ignore next */
		(!node.position \|\| !node.position.start)
		) {
		patch(node)
		// @ts-expect-error: we just set it.
		node.position.start = child.position.start
		}

		if (
		index < 1 &&
		/* c8 ignore next */
		(!node.position \|\| !node.position.start)
		) {
		patch(node)
		node.position.start = child.position.start
		if (
		index === siblings.length - 1 &&
		(!node.position \|\| !node.position.end)
		) {
		patch(node)
		// @ts-expect-error: we just set it.
		node.position.end = child.position.end
		}
		}
		)

		if (index === siblings.length - 1 && (!node.position \|\| !node.position.end)) {
		patch(node)
		node.position.end = child.position.end
		}
		})

		// Add a `position` object when it does not yet exist on `node`.
		/**
		* @param {Node} node
		*/
		function patch(node) {
		if (!node.position) {
		// @ts-expect-error: fine.
		node.position = {}
		}
		}

25

lib/plugin/remove-empty-nodes.js

+@@ -0,12 +1,23 @@
+/**
+ * @typedef {import('nlcst').Root} Root
+ * @typedef {import('nlcst').Paragraph} Paragraph
+ */
+import {modifyChildren} from 'unist-util-modify-children'
+// Remove empty children.
+export const removeEmptyNodes = modifyChildren(function (child, index, parent) {
+  if ('children' in child && child.children.length === 0) {
+    parent.children.splice(index, 1)
+export const removeEmptyNodes = modifyChildren(
+  /**
+   * @type {import('unist-util-modify-children').Modifier<Root | Paragraph>}
+   */
+    // Next, iterate over the node *now* at the current position (which was the
+    // next node).
+    return index
+  function (child, index, parent) {
+    if ('children' in child && child.children.length === 0) {
+      parent.children.splice(index, 1)
+      // Next, iterate over the node *now* at the current position (which was the
+      // next node).
+      return index
+})

25

package.json

		{
		"name": "parse-latin",
		"version": "5.0.1",
		"version": "6.0.0",
		"description": "Latin-script (natural language) parser",
		@@ -27,16 +27,22 @@ "license": "MIT",
		"main": "index.js",
		"types": "index.d.ts",
		"files": [
		"lib/",
		"index.d.ts",
		"index.js"
		],
		"dependencies": {
		"@types/nlcst": "^1.0.0",
		"@types/unist": "^2.0.0",
		"nlcst-to-string": "^3.0.0",
		"unist-util-modify-children": "^3.0.0",
		"unist-util-visit-children": "^2.0.0"
		"unist-util-visit-children": "^2.0.0",
		"vfile": "^5.0.0"
		},
		"devDependencies": {
		"@types/node": "^18.0.0",
		"@types/regenerate": "^1.0.0",
		"@unicode/unicode-13.0.0": "^1.0.0",
		"c8": "^7.0.0",
		"is-hidden": "^2.0.0",
		"negate": "^1.0.0",
		"nlcst-test": "^3.0.0",
		@@ -48,9 +54,11 @@ "nyc": "^15.0.0",
		"remark-preset-wooorm": "^9.0.0",
		"type-coverage": "^2.0.0",
		"typescript": "^4.0.0",
		"unist-util-remove-position": "^4.0.0",
		"vfile": "^5.0.0",
		"xo": "^0.52.0"
		},
		"scripts": {
		"prepack": "npm run generate && npm run format",
		"prepack": "npm run generate && npm run build && npm run format",
		"fixture": "node script/generate-fixture.js",
		"build": "tsc --build --clean && tsc --build && type-coverage",
		"generate": "node script/build-expressions.js",
		@@ -60,3 +68,3 @@ "format": "remark . -qfo && prettier . -w --loglevel warn && xo --fix",
		"test-coverage": "c8 --check-coverage --100 --reporter lcov npm run test-api",
		"test": "npm run generate && npm run format && npm run test-coverage"
		"test": "npm run generate && npm run build && npm run format && npm run test-coverage"
		},
		@@ -82,3 +90,8 @@ "prettier": {
		]
		},
		"typeCoverage": {
		"atLeast": 100,
		"detail": true,
		"strict": true
		}
		}

173

readme.md

		@@ -7,22 +7,50 @@ # parse-latin
		[![Size][size-badge]][size]
		[![Chat][chat-badge]][chat]

		A Latin-script language parser for [retext][retext] producing [nlcst][]
		nodes.
		A natural language parser, for Latin-script languages, that produces [nlcst][].

		## Contents

		* [What is this?](#what-is-this)
		* [When should I use this?](#when-should-i-use-this)
		* [Install](#install)
		* [Use](#use)
		* [API](#api)
		* [`ParseLatin()`](#parselatin)
		* [Algorithm](#algorithm)
		* [Types](#types)
		* [Compatibility](#compatibility)
		* [Related](#related)
		* [Contribute](#contribute)
		* [Security](#security)
		* [License](#license)

		## What is this?

		This package exposes a parser that takes Latin-script natural language and
		produces a syntax tree.

		## When should I use this?

		If you want to handle natural language as syntax trees manually, use this.

		Alternatively, you can use the retext plugin [`retext-latin`][retext-latin],
		which wraps this project to also parse natural language at a higher-level
		(easier) abstraction.

		Whether Old-English (“þā gewearþ þǣm hlāforde and þǣm hȳrigmannum wiþ ānum
		penninge”), Icelandic (“Hvað er að frétta”), French (“Où sont les toilettes?”),
		`parse-latin` does a good job at tokenizing it.
		this project does a good job at tokenizing it.

		Note also that `parse-latin` does a decent job at tokenizing Latin-like scripts,
		Cyrillic (“Добро пожаловать!”), Georgian (“როგორა ხარ?”), Armenian (“Շատ հաճելի
		է”), and such.
		For English and Dutch, you can instead use [`parse-english`][parse-english] and
		[`parse-dutch`][parse-dutch].

		You can somewhat use this for Latin-like scripts, such as Cyrillic
		(“Добро пожаловать!”), Georgian (“როგორა ხარ?”), Armenian (“Շատ հաճելի է”),
		and such.

		## Install

		This package is ESM only: Node 12+ is needed to use it and it must be `import`ed
		instead of `require`d.
		This package is [ESM only][esm].
		In Node.js (version 14.14+, 16.0+), install with [npm][]:

		[npm][]:

		```sh
		@@ -32,2 +60,16 @@ npm install parse-latin

		In Deno with [`esm.sh`][esmsh]:

		```js
		import {ParseLatin} from 'https://esm.sh/parse-latin@6'
		```

		In browsers with [`esm.sh`][esmsh]:

		```html
		<script type="module">
		import {ParseLatin} from 'https://esm.sh/parse-latin@6?bundle'
		</script>
		```

		## Use
		@@ -44,3 +86,3 @@

		Which, when inspecting, yields:
		Yields:

		@@ -64,55 +106,76 @@ ```txt

		This package exports the following identifiers: `ParseLatin`.
		This package exports the identifier `ParseLatin`.
		There is no default export.

		### `ParseLatin(value)`
		### `ParseLatin()`

		Exposes the functionality needed to tokenize natural Latin-script languages into
		a syntax tree.
		If `value` is passed here, it’s not needed to give it to `#parse()`.
		Create a new parser.

		#### `ParseLatin#tokenize(value)`
		#### `ParseLatin#parse(value)`

		Tokenize `value` (`string`) into letters and numbers (words), white space, and
		everything else (punctuation).
		The returned nodes are a flat list without paragraphs or sentences.
		Turn natural language into a syntax tree.

		###### Returns
		##### Parameters

		[`Array.<Node>`][nlcst] — Nodes.
		###### `value`

		#### `ParseLatin#parse(value)`
		Value to parse (`string`).

		Tokenize `value` (`string`) into an [NLCST][] tree.
		The returned node is a `RootNode` with in it paragraphs and sentences.
		##### Returns

		###### Returns
		[`RootNode`][root].

		[`Node`][nlcst] — Root node.

		## Algorithm

		> Note: The easiest way to see how parse-latin tokenizes and parses, is by
		> using the [online parser demo][demo], which
		> shows the syntax tree corresponding to the typed text.
		> 👉 Note:
		> The easiest way to see how `parse-latin` parses, is by using the
		> [online parser demo][demo], which shows the syntax tree corresponding to
		> the typed text.

		`parse-latin` splits text into white space, word, and punctuation tokens.
		`parse-latin` starts out with a pretty easy definition, one that most other
		tokenizers use:
		`parse-latin` splits text into white space, punctuation, symbol, and word
		tokens:

		* A “word” is one or more letter or number characters
		* A “white space” is one or more white space characters
		* A “punctuation” is one or more of anything else
		* “word” is one or more unicode letters or numbers
		* “white space” is one or more unicode white space characters
		* “punctuation” is one or more unicode punctuation characters
		* “symbol” is one or more of anything else

		Then, it manipulates and merges those tokens into a ([nlcst][]) syntax tree,
		adding sentences and paragraphs where needed.
		Then, it manipulates and merges those tokens into a syntax tree, adding
		sentences and paragraphs where needed.

		* Some punctuation marks are part of the word they occur in, such as
		* some punctuation marks are part of the word they occur in, such as
		`non-profit`, `she’s`, `G.I.`, `11:00`, `N/A`, `&c`, `nineteenth- and…`
		* Some full-stops do not mark a sentence end, such as `1.`, `e.g.`, `id.`
		* Although full-stops, question marks, and exclamation marks (sometimes) end a
		* some periods do not mark a sentence end, such as `1.`, `e.g.`, `id.`
		* although periods, question marks, and exclamation marks (sometimes) end a
		sentence, that end might not occur directly after the mark, such as `.)`,
		`."`
		* And many more exceptions
		* …and many more exceptions

		## Types

		This package is fully typed with [TypeScript][].
		It exports no additional types.

		## Compatibility

		This package is at least compatible with all maintained versions of Node.js.
		As of now, that is Node.js 14.14+ and 16.0+.
		It also works in Deno and modern browsers.

		## Related

		* [`parse-english`](https://github.com/wooorm/parse-english)
		— English (natural language) parser
		* [`parse-dutch`](https://github.com/wooorm/parse-dutch)
		— Dutch (natural language) parser

		## Contribute

		Yes please!
		See [How to Contribute to Open Source][contribute].

		## Security

		This package is safe.

		## License
		@@ -140,6 +203,2 @@

		[chat-badge]: https://img.shields.io/badge/join%20the%20community-on%20spectrum-7b16ff.svg

		[chat]: https://spectrum.chat/unified/retext

		[npm]: https://docs.npmjs.com/cli/install
		@@ -149,2 +208,10 @@

		[esm]: https://gist.github.com/sindresorhus/a39789f98801d908bbc7ff3ecc99d99c

		[esmsh]: https://esm.sh

		[typescript]: https://www.typescriptlang.org

		[contribute]: https://opensource.guide/how-to-contribute/

		[license]: license
		@@ -154,4 +221,10 @@

		[retext]: https://github.com/retextjs/retext
		[nlcst]: https://github.com/syntax-tree/nlcst

		[nlcst]: https://github.com/syntax-tree/nlcst
		[root]: https://github.com/syntax-tree/nlcst#root

		[retext-latin]: https://github.com/retextjs/retext/tree/main/packages/retext-latin

		[parse-english]: https://github.com/wooorm/parse-english

		[parse-dutch]: https://github.com/wooorm/parse-dutch

lib/parser.js

lib/plugin/merge-words.js

lib/tokenizer.js

parse-latin - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics

Dependency changes