parse-latin
Advanced tools
Comparing version 5.0.1 to 6.0.0
522
lib/index.js
@@ -0,1 +1,12 @@ | ||
/** | ||
* @typedef {import('vfile').VFile} VFile | ||
* @typedef {import('nlcst').Parent} Parent | ||
* @typedef {import('nlcst').Content} Content | ||
* @typedef {import('nlcst').SentenceContent} SentenceContent | ||
* @typedef {import('nlcst').Root} Root | ||
* @typedef {import('nlcst').Paragraph} Paragraph | ||
* @typedef {import('nlcst').Sentence} Sentence | ||
*/ | ||
import {toString} from 'nlcst-to-string' | ||
import {mergeInitialWordSymbol} from './plugin/merge-initial-word-symbol.js' | ||
@@ -6,3 +17,2 @@ import {mergeFinalWordSymbol} from './plugin/merge-final-word-symbol.js' | ||
import {mergeInitialisms} from './plugin/merge-initialisms.js' | ||
import {mergeWords} from './plugin/merge-words.js' | ||
import {patchPosition} from './plugin/patch-position.js' | ||
@@ -20,3 +30,2 @@ import {mergeNonWordSentences} from './plugin/merge-non-word-sentences.js' | ||
import {removeEmptyNodes} from './plugin/remove-empty-nodes.js' | ||
import {parserFactory} from './parser.js' | ||
import { | ||
@@ -35,25 +44,28 @@ newLine, | ||
export class ParseLatin { | ||
/** | ||
* @param {string|null|undefined} [doc] | ||
* @param {VFile|null|undefined} [file] | ||
*/ | ||
constructor(doc, file) { | ||
const value = file || doc | ||
/** @type {string|null} */ | ||
this.doc = value ? String(value) : null | ||
} | ||
// Run transform plugins for `key` on `nodes`. | ||
run(key, nodes) { | ||
const wareKey = key + 'Plugins' | ||
const plugins = this[wareKey] | ||
let index = -1 | ||
if (plugins) { | ||
while (plugins[++index]) { | ||
plugins[index](nodes) | ||
} | ||
} | ||
return nodes | ||
/** @type {Array<(node: Root) => void>} */ | ||
this.tokenizeRootPlugins = [...this.tokenizeRootPlugins] | ||
/** @type {Array<(node: Paragraph) => void>} */ | ||
this.tokenizeParagraphPlugins = [...this.tokenizeParagraphPlugins] | ||
/** @type {Array<(node: Sentence) => void>} */ | ||
this.tokenizeSentencePlugins = [...this.tokenizeSentencePlugins] | ||
} | ||
// Easy access to the document parser. This additionally supports retext-style | ||
// invocation: where an instance is created for each file, and the file is given | ||
// on construction. | ||
/** | ||
* Easy access to the document parser. | ||
* This additionally supports `retext`-like calls, where an instance is | ||
* created for each file, and the file is given on construction. | ||
* | ||
* @param {string|undefined|null} [value] | ||
* @returns {Root} | ||
*/ | ||
parse(value) { | ||
@@ -63,259 +75,164 @@ return this.tokenizeRoot(value || this.doc) | ||
// Transform a `value` into a list of `NLCSTNode`s. | ||
tokenize(value) { | ||
const tokens = [] | ||
/** | ||
* Parse as a root. | ||
* | ||
* @param {string|undefined|null} [value] | ||
* @returns {Root} | ||
*/ | ||
tokenizeRoot(value) { | ||
const paragraph = this.tokenizeParagraph(value) | ||
/** @type {Root} */ | ||
const result = { | ||
type: 'RootNode', | ||
children: splitNode(paragraph, 'WhiteSpaceNode', newLine) | ||
} | ||
if (value === null || value === undefined) { | ||
value = '' | ||
} else if (value instanceof String) { | ||
value = value.toString() | ||
let index = -1 | ||
while (this.tokenizeRootPlugins[++index]) { | ||
this.tokenizeRootPlugins[index](result) | ||
} | ||
if (typeof value !== 'string') { | ||
// Return the given nodes if this is either an empty array, or an array with | ||
// a node as a first child. | ||
if ('length' in value && (!value[0] || value[0].type)) { | ||
return value | ||
} | ||
return result | ||
} | ||
throw new Error( | ||
"Illegal invocation: '" + | ||
value + | ||
"' is not a valid argument for 'ParseLatin'" | ||
) | ||
/** | ||
* Parse as a paragraph. | ||
* | ||
* @param {string|undefined|null} [value] | ||
* @returns {Paragraph} | ||
*/ | ||
tokenizeParagraph(value) { | ||
const sentence = this.tokenizeSentence(value) | ||
/** @type {Paragraph} */ | ||
const result = { | ||
type: 'ParagraphNode', | ||
children: splitNode(sentence, 'PunctuationNode', terminalMarker) | ||
} | ||
if (!value) { | ||
return tokens | ||
let index = -1 | ||
while (this.tokenizeParagraphPlugins[++index]) { | ||
this.tokenizeParagraphPlugins[index](result) | ||
} | ||
// Eat mechanism to use. | ||
const eater = this.position ? eat : noPositionEat | ||
return result | ||
} | ||
let index = 0 | ||
let offset = 0 | ||
let line = 1 | ||
let column = 1 | ||
let previous = '' | ||
let queue = '' | ||
let left | ||
let right | ||
let character | ||
/** | ||
* Parse as a sentence. | ||
* | ||
* @param {string|undefined|null} [value] | ||
* @returns {Sentence} | ||
*/ | ||
tokenizeSentence(value) { | ||
const children = this.tokenize(value) | ||
/** @type {Sentence} */ | ||
const result = {type: 'SentenceNode', children} | ||
while (index < value.length) { | ||
character = value.charAt(index) | ||
let index = -1 | ||
while (this.tokenizeSentencePlugins[++index]) { | ||
this.tokenizeSentencePlugins[index](result) | ||
} | ||
if (whiteSpace.test(character)) { | ||
right = 'WhiteSpace' | ||
} else if (punctuation.test(character)) { | ||
right = 'Punctuation' | ||
} else if (word.test(character)) { | ||
right = 'Word' | ||
} else { | ||
right = 'Symbol' | ||
} | ||
return result | ||
} | ||
tick.call(this) | ||
/** | ||
* Transform a `value` into a list of `NLCSTNode`s. | ||
* | ||
* @param {string|undefined|null} [value] | ||
* @returns {Array<SentenceContent>} | ||
*/ | ||
tokenize(value) { | ||
/** @type {Array<SentenceContent>} */ | ||
const children = [] | ||
previous = character | ||
character = '' | ||
left = right | ||
right = null | ||
index++ | ||
if (!value) { | ||
return children | ||
} | ||
tick.call(this) | ||
const currentPoint = {line: 1, column: 1, offset: 0} | ||
let from = 0 | ||
let index = 0 | ||
let start = {...currentPoint} | ||
/** @type {SentenceContent['type']|undefined} */ | ||
let previousType | ||
/** @type {string|undefined} */ | ||
let previous | ||
return tokens | ||
while (index < value.length) { | ||
const current = value.charAt(index) | ||
const currentType = whiteSpace.test(current) | ||
? 'WhiteSpaceNode' | ||
: punctuation.test(current) | ||
? 'PunctuationNode' | ||
: word.test(current) | ||
? 'WordNode' | ||
: 'SymbolNode' | ||
// Check one character. | ||
function tick() { | ||
if ( | ||
left === right && | ||
(left === 'Word' || | ||
left === 'WhiteSpace' || | ||
character === previous || | ||
surrogates.test(character)) | ||
from < index && | ||
previousType && | ||
currentType && | ||
!( | ||
previousType === currentType && | ||
// Words or white space continue. | ||
(previousType === 'WordNode' || | ||
previousType === 'WhiteSpaceNode' || | ||
// Same character of punctuation or symbol also continues. | ||
current === previous || | ||
// Surrogates of punctuation or symbol also continue. | ||
surrogates.test(current)) | ||
) | ||
) { | ||
queue += character | ||
} else { | ||
// Flush the previous queue. | ||
if (queue) { | ||
this['tokenize' + left](queue, eater) | ||
} | ||
queue = character | ||
children.push(createNode(previousType, value.slice(from, index))) | ||
from = index | ||
start = {...currentPoint} | ||
} | ||
} | ||
// Remove `subvalue` from `value`. | ||
// Expects `subvalue` to be at the start from `value`, and applies no | ||
// validation. | ||
function eat(subvalue) { | ||
const pos = position() | ||
update(subvalue) | ||
return apply | ||
// Add the given arguments, add `position` to the returned node, and return | ||
// the node. | ||
function apply(...input) { | ||
return pos(add(...input)) | ||
if (current === '\r' || (current === '\n' && previous !== '\r')) { | ||
currentPoint.line++ | ||
currentPoint.column = 1 | ||
} else if (current !== '\n') { | ||
currentPoint.column++ | ||
} | ||
} | ||
// Remove `subvalue` from `value`. | ||
// Does not patch positional information. | ||
function noPositionEat() { | ||
return add | ||
currentPoint.offset++ | ||
previousType = currentType | ||
previous = current | ||
index++ | ||
} | ||
// Add mechanism. | ||
function add(node, parent) { | ||
if (parent) { | ||
parent.children.push(node) | ||
} else { | ||
tokens.push(node) | ||
} | ||
return node | ||
if (previousType && from < index) { | ||
children.push(createNode(previousType, value.slice(from, index))) | ||
} | ||
// Mark position and patch `node.position`. | ||
function position() { | ||
const before = now() | ||
return children | ||
// Add the position to a node. | ||
function patch(node) { | ||
node.position = new Position(before) | ||
return node | ||
} | ||
return patch | ||
/** | ||
* | ||
* @param {SentenceContent['type']} type | ||
* @param {string} value | ||
* @returns {SentenceContent} | ||
*/ | ||
function createNode(type, value) { | ||
return type === 'WordNode' | ||
? { | ||
type: 'WordNode', | ||
children: [ | ||
{ | ||
type: 'TextNode', | ||
value, | ||
position: {start, end: {...currentPoint}} | ||
} | ||
], | ||
position: {start, end: {...currentPoint}} | ||
} | ||
: {type, value, position: {start, end: {...currentPoint}}} | ||
} | ||
// Update line and column based on `value`. | ||
function update(subvalue) { | ||
let character = -1 | ||
let lastIndex = -1 | ||
offset += subvalue.length | ||
while (++character < subvalue.length) { | ||
if (subvalue.charAt(character) === '\n') { | ||
lastIndex = character | ||
line++ | ||
} | ||
} | ||
if (lastIndex < 0) { | ||
column += subvalue.length | ||
} else { | ||
column = subvalue.length - lastIndex | ||
} | ||
} | ||
// Store position information for a node. | ||
function Position(start) { | ||
this.start = start | ||
this.end = now() | ||
} | ||
// Get the current position. | ||
function now() { | ||
return {line, column, offset} | ||
} | ||
} | ||
} | ||
// Default position. | ||
ParseLatin.prototype.position = true | ||
// Create text nodes. | ||
ParseLatin.prototype.tokenizeSymbol = createTextFactory('Symbol') | ||
ParseLatin.prototype.tokenizeWhiteSpace = createTextFactory('WhiteSpace') | ||
ParseLatin.prototype.tokenizePunctuation = createTextFactory('Punctuation') | ||
ParseLatin.prototype.tokenizeSource = createTextFactory('Source') | ||
ParseLatin.prototype.tokenizeText = createTextFactory('Text') | ||
// Inject `plugins` to modify the result of the method at `key` on the operated | ||
// on context. | ||
ParseLatin.prototype.use = useFactory(function (context, key, plugins) { | ||
context[key] = context[key].concat(plugins) | ||
}) | ||
// Inject `plugins` to modify the result of the method at `key` on the operated | ||
// on context, before any other. | ||
ParseLatin.prototype.useFirst = useFactory(function (context, key, plugins) { | ||
context[key] = plugins.concat(context[key]) | ||
}) | ||
// PARENT NODES | ||
// | ||
// All these nodes are `pluggable`: they come with a `use` method which accepts | ||
// a plugin (`function(NLCSTNode)`). | ||
// Every time one of these methods is called, the plugin is invoked with the | ||
// node, allowing for easy modification. | ||
// | ||
// In fact, the internal transformation from `tokenize` (a list of words, white | ||
// space, punctuation, and symbols) to `tokenizeRoot` (an NLCST tree), is also | ||
// implemented through this mechanism. | ||
// Create a `WordNode` with its children set to a single `TextNode`, its value | ||
// set to the given `value`. | ||
pluggable(ParseLatin, 'tokenizeWord', function (value, eat) { | ||
const add = (eat || noopEat)('') | ||
const parent = {type: 'WordNode', children: []} | ||
this.tokenizeText(value, eat, parent) | ||
return add(parent) | ||
}) | ||
// Create a `SentenceNode` with its children set to `Node`s, their values set | ||
// to the tokenized given `value`. | ||
// | ||
// Unless plugins add new nodes, the sentence is populated by `WordNode`s, | ||
// `SymbolNode`s, `PunctuationNode`s, and `WhiteSpaceNode`s. | ||
pluggable( | ||
ParseLatin, | ||
'tokenizeSentence', | ||
parserFactory({type: 'SentenceNode', tokenizer: 'tokenize'}) | ||
) | ||
// Create a `ParagraphNode` with its children set to `Node`s, their values set | ||
// to the tokenized given `value`. | ||
// | ||
// Unless plugins add new nodes, the paragraph is populated by `SentenceNode`s | ||
// and `WhiteSpaceNode`s. | ||
pluggable( | ||
ParseLatin, | ||
'tokenizeParagraph', | ||
parserFactory({ | ||
type: 'ParagraphNode', | ||
delimiter: terminalMarker, | ||
delimiterType: 'PunctuationNode', | ||
tokenizer: 'tokenizeSentence' | ||
}) | ||
) | ||
// Create a `RootNode` with its children set to `Node`s, their values set to the | ||
// tokenized given `value`. | ||
pluggable( | ||
ParseLatin, | ||
'tokenizeRoot', | ||
parserFactory({ | ||
type: 'RootNode', | ||
delimiter: newLine, | ||
delimiterType: 'WhiteSpaceNode', | ||
tokenizer: 'tokenizeParagraph' | ||
}) | ||
) | ||
// PLUGINS | ||
ParseLatin.prototype.use('tokenizeSentence', [ | ||
/** List of transforms handling a sentence. */ | ||
ParseLatin.prototype.tokenizeSentencePlugins = [ | ||
mergeInitialWordSymbol, | ||
@@ -326,7 +243,7 @@ mergeFinalWordSymbol, | ||
mergeInitialisms, | ||
mergeWords, | ||
patchPosition | ||
]) | ||
] | ||
ParseLatin.prototype.use('tokenizeParagraph', [ | ||
/** List of transforms handling a paragraph. */ | ||
ParseLatin.prototype.tokenizeParagraphPlugins = [ | ||
mergeNonWordSentences, | ||
@@ -344,5 +261,6 @@ mergeAffixSymbol, | ||
patchPosition | ||
]) | ||
] | ||
ParseLatin.prototype.use('tokenizeRoot', [ | ||
/** List of transforms handling a root. */ | ||
ParseLatin.prototype.tokenizeRootPlugins = [ | ||
makeInitialWhiteSpaceSiblings, | ||
@@ -352,80 +270,48 @@ makeFinalWhiteSpaceSiblings, | ||
patchPosition | ||
]) | ||
] | ||
// TEXT NODES | ||
/** | ||
* A function that splits one node into several nodes. | ||
* | ||
* @template {Parent} TheNode | ||
* @param {TheNode} node | ||
* @param {Content['type']} childType | ||
* @param {RegExp} expression | ||
* @returns {Array<TheNode>} | ||
*/ | ||
function splitNode(node, childType, expression) { | ||
/** @type {Array<TheNode>} */ | ||
const result = [] | ||
let index = -1 | ||
let start = 0 | ||
// Factory to create a `Text`. | ||
function createTextFactory(type) { | ||
type += 'Node' | ||
while (++index < node.children.length) { | ||
const token = node.children[index] | ||
return createText | ||
if ( | ||
index === node.children.length - 1 || | ||
(token.type === childType && expression.test(toString(token))) | ||
) { | ||
/** @type {TheNode} */ | ||
// @ts-expect-error: fine | ||
const parent = { | ||
type: node.type, | ||
children: node.children.slice(start, index + 1) | ||
} | ||
// Construct a `Text` from a bound `type` | ||
function createText(value, eat, parent) { | ||
if (value === null || value === undefined) { | ||
value = '' | ||
} | ||
const first = node.children[start] | ||
const last = token | ||
if (first.position && last.position) { | ||
parent.position = { | ||
start: first.position.start, | ||
end: last.position.end | ||
} | ||
} | ||
return (eat || noopEat)(value)({type, value: String(value)}, parent) | ||
} | ||
} | ||
// Make a method “pluggable”. | ||
function pluggable(Constructor, key, callback) { | ||
// Set a pluggable version of `callback` on `Constructor`. | ||
Constructor.prototype[key] = function (...input) { | ||
return this.run(key, callback.apply(this, input)) | ||
} | ||
} | ||
// Factory to inject `plugins`. Takes `callback` for the actual inserting. | ||
function useFactory(callback) { | ||
return use | ||
// Validate if `plugins` can be inserted. | ||
// Invokes the bound `callback` to do the actual inserting. | ||
function use(key, plugins) { | ||
// Throw if the method is not pluggable. | ||
if (!(key in this)) { | ||
throw new Error( | ||
'Illegal Invocation: Unsupported `key` for ' + | ||
'`use(key, plugins)`. Make sure `key` is a ' + | ||
'supported function' | ||
) | ||
result.push(parent) | ||
start = index + 1 | ||
} | ||
// Fail silently when no plugins are given. | ||
if (!plugins) { | ||
return | ||
} | ||
const wareKey = key + 'Plugins' | ||
// Make sure `plugins` is a list. | ||
plugins = typeof plugins === 'function' ? [plugins] : plugins.concat() | ||
// Make sure `wareKey` exists. | ||
if (!this[wareKey]) { | ||
this[wareKey] = [] | ||
} | ||
// Invoke callback with the ware key and plugins. | ||
callback(this, wareKey, plugins) | ||
} | ||
} | ||
// Add mechanism used when text-tokenisers are called directly outside of the | ||
// `tokenize` function. | ||
function noopAdd(node, parent) { | ||
if (parent) { | ||
parent.children.push(node) | ||
} | ||
return node | ||
return result | ||
} | ||
// Eat and add mechanism without adding positional information, used when | ||
// text-tokenisers are called directly outside of the `tokenize` function. | ||
function noopEat() { | ||
return noopAdd | ||
} |
@@ -0,1 +1,6 @@ | ||
/** | ||
* @typedef {import('nlcst').Paragraph} Paragraph | ||
* @typedef {import('nlcst').Sentence} Sentence | ||
*/ | ||
import {toString} from 'nlcst-to-string' | ||
@@ -5,48 +10,51 @@ import {modifyChildren} from 'unist-util-modify-children' | ||
// Break a sentence if a white space with more than one new-line is found. | ||
export const breakImplicitSentences = modifyChildren(function ( | ||
child, | ||
index, | ||
parent | ||
) { | ||
if (child.type !== 'SentenceNode') { | ||
return | ||
} | ||
export const breakImplicitSentences = modifyChildren( | ||
/** | ||
* @type {import('unist-util-modify-children').Modifier<Paragraph>} | ||
*/ | ||
const children = child.children | ||
function (child, index, parent) { | ||
if (child.type !== 'SentenceNode') { | ||
return | ||
} | ||
// Ignore first and last child. | ||
let position = 0 | ||
const children = child.children | ||
while (++position < children.length - 1) { | ||
const node = children[position] | ||
// Ignore first and last child. | ||
let position = 0 | ||
if ( | ||
node.type !== 'WhiteSpaceNode' || | ||
toString(node).split(/\r\n|\r|\n/).length < 3 | ||
) { | ||
continue | ||
} | ||
while (++position < children.length - 1) { | ||
const node = children[position] | ||
child.children = children.slice(0, position) | ||
if ( | ||
node.type !== 'WhiteSpaceNode' || | ||
toString(node).split(/\r\n|\r|\n/).length < 3 | ||
) { | ||
continue | ||
} | ||
const insertion = { | ||
type: 'SentenceNode', | ||
children: children.slice(position + 1) | ||
} | ||
child.children = children.slice(0, position) | ||
const tail = children[position - 1] | ||
const head = children[position + 1] | ||
/** @type {Sentence} */ | ||
const insertion = { | ||
type: 'SentenceNode', | ||
children: children.slice(position + 1) | ||
} | ||
parent.children.splice(index + 1, 0, node, insertion) | ||
const tail = children[position - 1] | ||
const head = children[position + 1] | ||
if (child.position && tail.position && head.position) { | ||
const end = child.position.end | ||
parent.children.splice(index + 1, 0, node, insertion) | ||
child.position.end = tail.position.end | ||
if (child.position && tail.position && head.position) { | ||
const end = child.position.end | ||
insertion.position = {start: head.position.start, end} | ||
child.position.end = tail.position.end | ||
insertion.position = {start: head.position.start, end} | ||
} | ||
return index + 1 | ||
} | ||
return index + 1 | ||
} | ||
}) | ||
) |
@@ -0,1 +1,6 @@ | ||
/** | ||
* @typedef {import('nlcst').Root} Root | ||
* @typedef {import('nlcst').Paragraph} Paragraph | ||
*/ | ||
import {modifyChildren} from 'unist-util-modify-children' | ||
@@ -5,24 +10,25 @@ | ||
// paragraphs. | ||
export const makeFinalWhiteSpaceSiblings = modifyChildren(function ( | ||
child, | ||
index, | ||
parent | ||
) { | ||
const children = child.children | ||
export const makeFinalWhiteSpaceSiblings = modifyChildren( | ||
/** | ||
* @type {import('unist-util-modify-children').Modifier<Root | Paragraph>} | ||
*/ | ||
if ( | ||
children && | ||
children.length > 0 && | ||
children[children.length - 1].type === 'WhiteSpaceNode' | ||
) { | ||
parent.children.splice(index + 1, 0, child.children.pop()) | ||
const previous = children[children.length - 1] | ||
function (child, index, parent) { | ||
if ('children' in child) { | ||
const tail = child.children[child.children.length - 1] | ||
if (previous && previous.position && child.position) { | ||
child.position.end = previous.position.end | ||
if (tail && tail.type === 'WhiteSpaceNode') { | ||
child.children.pop() // Remove `tail`. | ||
parent.children.splice(index + 1, 0, tail) | ||
const previous = child.children[child.children.length - 1] | ||
if (previous && previous.position && child.position) { | ||
child.position.end = previous.position.end | ||
} | ||
// Next, iterate over the current node again. | ||
return index | ||
} | ||
} | ||
// Next, iterate over the current node again. | ||
return index | ||
} | ||
}) | ||
) |
@@ -0,1 +1,6 @@ | ||
/** | ||
* @typedef {import('nlcst').Root} Root | ||
* @typedef {import('nlcst').Paragraph} Paragraph | ||
*/ | ||
import {visitChildren} from 'unist-util-visit-children' | ||
@@ -5,20 +10,20 @@ | ||
// sentences. | ||
export const makeInitialWhiteSpaceSiblings = visitChildren(function ( | ||
child, | ||
index, | ||
parent | ||
) { | ||
const children = child.children | ||
if ( | ||
children && | ||
children.length > 0 && | ||
children[0].type === 'WhiteSpaceNode' | ||
) { | ||
parent.children.splice(index, 0, children.shift()) | ||
const next = children[0] | ||
export const makeInitialWhiteSpaceSiblings = visitChildren( | ||
/** | ||
* @type {import('unist-util-visit-children').Visitor<Paragraph|Root>} | ||
*/ | ||
function (child, index, parent) { | ||
if ('children' in child && child.children) { | ||
const head = child.children[0] | ||
if (head && head.type === 'WhiteSpaceNode') { | ||
child.children.shift() | ||
parent.children.splice(index, 0, head) | ||
const next = child.children[0] | ||
if (next && next.position && child.position) { | ||
child.position.start = next.position.start | ||
if (next && next.position && child.position) { | ||
child.position.start = next.position.start | ||
} | ||
} | ||
} | ||
} | ||
}) | ||
) |
@@ -0,1 +1,5 @@ | ||
/** | ||
* @typedef {import('nlcst').Paragraph} Paragraph | ||
*/ | ||
import {toString} from 'nlcst-to-string' | ||
@@ -6,43 +10,46 @@ import {modifyChildren} from 'unist-util-modify-children' | ||
// comma. | ||
export const mergeAffixExceptions = modifyChildren(function ( | ||
child, | ||
index, | ||
parent | ||
) { | ||
const children = child.children | ||
export const mergeAffixExceptions = modifyChildren( | ||
/** | ||
* @type {import('unist-util-modify-children').Modifier<Paragraph>} | ||
*/ | ||
function (child, index, parent) { | ||
const previous = parent.children[index - 1] | ||
if (!children || children.length === 0 || index < 1) { | ||
return | ||
} | ||
if ( | ||
previous && | ||
'children' in previous && | ||
'children' in child && | ||
child.children.length > 0 | ||
) { | ||
let position = -1 | ||
let position = -1 | ||
while (child.children[++position]) { | ||
const node = child.children[position] | ||
while (children[++position]) { | ||
const node = children[position] | ||
if (node.type === 'WordNode') { | ||
return | ||
} | ||
if (node.type === 'WordNode') { | ||
return | ||
} | ||
if (node.type === 'SymbolNode' || node.type === 'PunctuationNode') { | ||
const value = toString(node) | ||
if (node.type === 'SymbolNode' || node.type === 'PunctuationNode') { | ||
const value = toString(node) | ||
if (value !== ',' && value !== ';') { | ||
return | ||
} | ||
if (value !== ',' && value !== ';') { | ||
return | ||
} | ||
previous.children.push(...child.children) | ||
const previousChild = parent.children[index - 1] | ||
previousChild.children = previousChild.children.concat(children) | ||
// Update position. | ||
if (previous.position && child.position) { | ||
previous.position.end = child.position.end | ||
} | ||
// Update position. | ||
if (previousChild.position && child.position) { | ||
previousChild.position.end = child.position.end | ||
parent.children.splice(index, 1) | ||
// Next, iterate over the node *now* at the current position. | ||
return index | ||
} | ||
} | ||
parent.children.splice(index, 1) | ||
// Next, iterate over the node *now* at the current position. | ||
return index | ||
} | ||
} | ||
}) | ||
) |
@@ -0,1 +1,5 @@ | ||
/** | ||
* @typedef {import('nlcst').Paragraph} Paragraph | ||
*/ | ||
import {toString} from 'nlcst-to-string' | ||
@@ -11,29 +15,35 @@ import {modifyChildren} from 'unist-util-modify-children' | ||
// sentence) to the previous sentence. | ||
export const mergeAffixSymbol = modifyChildren(function (child, index, parent) { | ||
const children = child.children | ||
export const mergeAffixSymbol = modifyChildren( | ||
/** | ||
* @type {import('unist-util-modify-children').Modifier<Paragraph>} | ||
*/ | ||
function (child, index, parent) { | ||
if ('children' in child && child.children.length > 0 && index > 0) { | ||
const previous = parent.children[index - 1] | ||
const first = child.children[0] | ||
const second = child.children[1] | ||
if (children && children.length > 0 && index > 0) { | ||
const first = children[0] | ||
const second = children[1] | ||
const previous = parent.children[index - 1] | ||
if ( | ||
previous && | ||
previous.type === 'SentenceNode' && | ||
(first.type === 'SymbolNode' || first.type === 'PunctuationNode') && | ||
affixSymbol.test(toString(first)) | ||
) { | ||
child.children.shift() // Remove `first`. | ||
previous.children.push(first) | ||
if ( | ||
(first.type === 'SymbolNode' || first.type === 'PunctuationNode') && | ||
affixSymbol.test(toString(first)) | ||
) { | ||
previous.children.push(children.shift()) | ||
// Update position. | ||
if (first.position && previous.position) { | ||
previous.position.end = first.position.end | ||
} | ||
// Update position. | ||
if (first.position && previous.position) { | ||
previous.position.end = first.position.end | ||
} | ||
if (second && second.position && child.position) { | ||
child.position.start = second.position.start | ||
} | ||
if (second && second.position && child.position) { | ||
child.position.start = second.position.start | ||
// Next, iterate over the previous node again. | ||
return index - 1 | ||
} | ||
// Next, iterate over the previous node again. | ||
return index - 1 | ||
} | ||
} | ||
}) | ||
) |
@@ -0,1 +1,5 @@ | ||
/** | ||
* @typedef {import('nlcst').Sentence} Sentence | ||
*/ | ||
import {toString} from 'nlcst-to-string' | ||
@@ -5,37 +9,38 @@ import {modifyChildren} from 'unist-util-modify-children' | ||
// Merge certain punctuation marks into their preceding words. | ||
export const mergeFinalWordSymbol = modifyChildren(function ( | ||
child, | ||
index, | ||
parent | ||
) { | ||
if ( | ||
index > 0 && | ||
(child.type === 'SymbolNode' || child.type === 'PunctuationNode') && | ||
toString(child) === '-' | ||
) { | ||
const children = parent.children | ||
const previous = children[index - 1] | ||
const next = children[index + 1] | ||
export const mergeFinalWordSymbol = modifyChildren( | ||
/** | ||
* @type {import('unist-util-modify-children').Modifier<Sentence>} | ||
*/ | ||
function (child, index, parent) { | ||
if ( | ||
(!next || next.type !== 'WordNode') && | ||
previous && | ||
previous.type === 'WordNode' | ||
index > 0 && | ||
(child.type === 'SymbolNode' || child.type === 'PunctuationNode') && | ||
toString(child) === '-' | ||
) { | ||
// Remove `child` from parent. | ||
children.splice(index, 1) | ||
const children = parent.children | ||
const previous = children[index - 1] | ||
const next = children[index + 1] | ||
// Add the punctuation mark at the end of the previous node. | ||
previous.children.push(child) | ||
if ( | ||
(!next || next.type !== 'WordNode') && | ||
previous && | ||
previous.type === 'WordNode' | ||
) { | ||
// Remove `child` from parent. | ||
children.splice(index, 1) | ||
// Update position. | ||
if (previous.position && child.position) { | ||
previous.position.end = child.position.end | ||
// Add the punctuation mark at the end of the previous node. | ||
previous.children.push(child) | ||
// Update position. | ||
if (previous.position && child.position) { | ||
previous.position.end = child.position.end | ||
} | ||
// Next, iterate over the node *now* at the current position (which was | ||
// the next node). | ||
return index | ||
} | ||
// Next, iterate over the node *now* at the current position (which was | ||
// the next node). | ||
return index | ||
} | ||
} | ||
}) | ||
) |
@@ -0,1 +1,5 @@ | ||
/** | ||
* @typedef {import('nlcst').Paragraph} Paragraph | ||
*/ | ||
import {toString} from 'nlcst-to-string' | ||
@@ -7,29 +11,30 @@ import {modifyChildren} from 'unist-util-modify-children' | ||
// lower case letter. | ||
export const mergeInitialDigitSentences = modifyChildren(function ( | ||
child, | ||
index, | ||
parent | ||
) { | ||
const children = child.children | ||
const siblings = parent.children | ||
const previous = siblings[index - 1] | ||
const head = children[0] | ||
export const mergeInitialDigitSentences = modifyChildren( | ||
/** | ||
* @type {import('unist-util-modify-children').Modifier<Paragraph>} | ||
*/ | ||
function (child, index, parent) { | ||
const previous = parent.children[index - 1] | ||
if ( | ||
previous && | ||
head && | ||
head.type === 'WordNode' && | ||
digitStart.test(toString(head)) | ||
) { | ||
previous.children = previous.children.concat(children) | ||
siblings.splice(index, 1) | ||
if ( | ||
previous && | ||
previous.type === 'SentenceNode' && | ||
child.type === 'SentenceNode' | ||
) { | ||
const head = child.children[0] | ||
// Update position. | ||
if (previous.position && child.position) { | ||
previous.position.end = child.position.end | ||
if (head && head.type === 'WordNode' && digitStart.test(toString(head))) { | ||
previous.children.push(...child.children) | ||
parent.children.splice(index, 1) | ||
// Update position. | ||
if (previous.position && child.position) { | ||
previous.position.end = child.position.end | ||
} | ||
// Next, iterate over the node *now* at the current position. | ||
return index | ||
} | ||
} | ||
// Next, iterate over the node *now* at the current position. | ||
return index | ||
} | ||
}) | ||
) |
@@ -0,1 +1,5 @@ | ||
/** | ||
* @typedef {import('nlcst').Paragraph} Paragraph | ||
*/ | ||
import {toString} from 'nlcst-to-string' | ||
@@ -9,41 +13,42 @@ import {modifyChildren} from 'unist-util-modify-children' | ||
// lower case letter. | ||
export const mergeInitialLowerCaseLetterSentences = modifyChildren(function ( | ||
child, | ||
index, | ||
parent | ||
) { | ||
const children = child.children | ||
export const mergeInitialLowerCaseLetterSentences = modifyChildren( | ||
/** | ||
* @type {import('unist-util-modify-children').Modifier<Paragraph>} | ||
*/ | ||
function (child, index, parent) { | ||
if (child.type === 'SentenceNode' && index > 0) { | ||
const previous = parent.children[index - 1] | ||
const children = child.children | ||
if (children && children.length > 0 && index > 0) { | ||
let position = -1 | ||
if (children.length > 0 && previous.type === 'SentenceNode') { | ||
let position = -1 | ||
while (children[++position]) { | ||
const node = children[position] | ||
while (children[++position]) { | ||
const node = children[position] | ||
if (node.type === 'WordNode') { | ||
if (!lowerInitial.test(toString(node))) { | ||
return | ||
} | ||
if (node.type === 'WordNode') { | ||
if (!lowerInitial.test(toString(node))) { | ||
return | ||
} | ||
const siblings = parent.children | ||
const previous = siblings[index - 1] | ||
previous.children.push(...children) | ||
previous.children = previous.children.concat(children) | ||
parent.children.splice(index, 1) | ||
siblings.splice(index, 1) | ||
// Update position. | ||
if (previous.position && child.position) { | ||
previous.position.end = child.position.end | ||
} | ||
// Update position. | ||
if (previous.position && child.position) { | ||
previous.position.end = child.position.end | ||
// Next, iterate over the node *now* at the current position. | ||
return index | ||
} | ||
if (node.type === 'SymbolNode' || node.type === 'PunctuationNode') { | ||
return | ||
} | ||
} | ||
// Next, iterate over the node *now* at the current position. | ||
return index | ||
} | ||
if (node.type === 'SymbolNode' || node.type === 'PunctuationNode') { | ||
return | ||
} | ||
} | ||
} | ||
}) | ||
) |
@@ -0,1 +1,5 @@ | ||
/** | ||
* @typedef {import('nlcst').Sentence} Sentence | ||
*/ | ||
import {toString} from 'nlcst-to-string' | ||
@@ -5,39 +9,40 @@ import {modifyChildren} from 'unist-util-modify-children' | ||
// Merge certain punctuation marks into their following words. | ||
export const mergeInitialWordSymbol = modifyChildren(function ( | ||
child, | ||
index, | ||
parent | ||
) { | ||
if ( | ||
(child.type !== 'SymbolNode' && child.type !== 'PunctuationNode') || | ||
toString(child) !== '&' | ||
) { | ||
return | ||
} | ||
export const mergeInitialWordSymbol = modifyChildren( | ||
/** | ||
* @type {import('unist-util-modify-children').Modifier<Sentence>} | ||
*/ | ||
function (child, index, parent) { | ||
if ( | ||
(child.type !== 'SymbolNode' && child.type !== 'PunctuationNode') || | ||
toString(child) !== '&' | ||
) { | ||
return | ||
} | ||
const children = parent.children | ||
const next = children[index + 1] | ||
const children = parent.children | ||
const next = children[index + 1] | ||
// If either a previous word, or no following word, exists, exit early. | ||
if ( | ||
(index > 0 && children[index - 1].type === 'WordNode') || | ||
!(next && next.type === 'WordNode') | ||
) { | ||
return | ||
} | ||
// If either a previous word, or no following word, exists, exit early. | ||
if ( | ||
(index > 0 && children[index - 1].type === 'WordNode') || | ||
!(next && next.type === 'WordNode') | ||
) { | ||
return | ||
} | ||
// Remove `child` from parent. | ||
children.splice(index, 1) | ||
// Remove `child` from parent. | ||
children.splice(index, 1) | ||
// Add the punctuation mark at the start of the next node. | ||
next.children.unshift(child) | ||
// Add the punctuation mark at the start of the next node. | ||
next.children.unshift(child) | ||
// Update position. | ||
if (next.position && child.position) { | ||
next.position.start = child.position.start | ||
// Update position. | ||
if (next.position && child.position) { | ||
next.position.start = child.position.start | ||
} | ||
// Next, iterate over the node at the previous position, as it's now adjacent | ||
// to a following word. | ||
return index - 1 | ||
} | ||
// Next, iterate over the node at the previous position, as it's now adjacent | ||
// to a following word. | ||
return index - 1 | ||
}) | ||
) |
@@ -0,1 +1,5 @@ | ||
/** | ||
* @typedef {import('nlcst').Sentence} Sentence | ||
*/ | ||
import {toString} from 'nlcst-to-string' | ||
@@ -6,58 +10,64 @@ import {modifyChildren} from 'unist-util-modify-children' | ||
// Merge initialisms. | ||
export const mergeInitialisms = modifyChildren(function (child, index, parent) { | ||
if (index > 0 && toString(child) === '.') { | ||
const siblings = parent.children | ||
const previous = siblings[index - 1] | ||
const children = previous.children | ||
export const mergeInitialisms = modifyChildren( | ||
/** | ||
* @type {import('unist-util-modify-children').Modifier<Sentence>} | ||
*/ | ||
function (child, index, parent) { | ||
if ( | ||
previous.type === 'WordNode' && | ||
children && | ||
children.length !== 1 && | ||
children.length % 2 !== 0 | ||
index > 0 && | ||
child.type === 'PunctuationNode' && | ||
toString(child) === '.' | ||
) { | ||
let position = children.length | ||
let isAllDigits = true | ||
const previous = parent.children[index - 1] | ||
while (children[--position]) { | ||
const otherChild = children[position] | ||
if ( | ||
previous.type === 'WordNode' && | ||
previous.children && | ||
previous.children.length !== 1 && | ||
previous.children.length % 2 !== 0 | ||
) { | ||
let position = previous.children.length | ||
let isAllDigits = true | ||
const value = toString(otherChild) | ||
while (previous.children[--position]) { | ||
const otherChild = previous.children[position] | ||
if (position % 2 === 0) { | ||
// Initialisms consist of one character values. | ||
if (value.length > 1) { | ||
return | ||
} | ||
const value = toString(otherChild) | ||
if (!numerical.test(value)) { | ||
isAllDigits = false | ||
if (position % 2 === 0) { | ||
// Initialisms consist of one character values. | ||
if (value.length > 1) { | ||
return | ||
} | ||
if (!numerical.test(value)) { | ||
isAllDigits = false | ||
} | ||
} else if (value !== '.') { | ||
if (position < previous.children.length - 2) { | ||
break | ||
} else { | ||
return | ||
} | ||
} | ||
} else if (value !== '.') { | ||
if (position < children.length - 2) { | ||
break | ||
} else { | ||
return | ||
} | ||
} | ||
} | ||
if (!isAllDigits) { | ||
// Remove `child` from parent. | ||
siblings.splice(index, 1) | ||
if (!isAllDigits) { | ||
// Remove `child` from parent. | ||
parent.children.splice(index, 1) | ||
// Add child to the previous children. | ||
children.push(child) | ||
// Add child to the previous children. | ||
previous.children.push(child) | ||
// Update position. | ||
if (previous.position && child.position) { | ||
previous.position.end = child.position.end | ||
// Update position. | ||
if (previous.position && child.position) { | ||
previous.position.end = child.position.end | ||
} | ||
// Next, iterate over the node *now* at the current position. | ||
return index | ||
} | ||
// Next, iterate over the node *now* at the current position. | ||
return index | ||
} | ||
} | ||
} | ||
}) | ||
) |
@@ -0,50 +1,57 @@ | ||
/** | ||
* @typedef {import('nlcst').SentenceContent} SentenceContent | ||
* @typedef {import('nlcst').WordContent} WordContent | ||
* @typedef {import('nlcst').Sentence} Sentence | ||
*/ | ||
import {toString} from 'nlcst-to-string' | ||
import {modifyChildren} from 'unist-util-modify-children' | ||
const slash = '/' | ||
// Merge words joined by certain punctuation marks. | ||
export const mergeInnerWordSlash = modifyChildren(function ( | ||
child, | ||
index, | ||
parent | ||
) { | ||
const siblings = parent.children | ||
const previous = siblings[index - 1] | ||
const next = siblings[index + 1] | ||
export const mergeInnerWordSlash = modifyChildren( | ||
/** | ||
* @type {import('unist-util-modify-children').Modifier<Sentence>} | ||
*/ | ||
function (child, index, parent) { | ||
const siblings = parent.children | ||
const previous = siblings[index - 1] | ||
if ( | ||
previous && | ||
previous.type === 'WordNode' && | ||
(child.type === 'SymbolNode' || child.type === 'PunctuationNode') && | ||
toString(child) === slash | ||
) { | ||
const previousValue = toString(previous) | ||
let tail = child | ||
let queue = [child] | ||
let count = 1 | ||
let nextValue = '' | ||
if ( | ||
previous && | ||
previous.type === 'WordNode' && | ||
(child.type === 'SymbolNode' || child.type === 'PunctuationNode') && | ||
toString(child) === '/' | ||
) { | ||
const previousValue = toString(previous) | ||
/** @type {SentenceContent} */ | ||
let tail = child | ||
/** @type {Array<WordContent>} */ | ||
const queue = [child] | ||
let count = 1 | ||
let nextValue = '' | ||
const next = siblings[index + 1] | ||
if (next && next.type === 'WordNode') { | ||
nextValue = toString(next) | ||
tail = next | ||
queue = queue.concat(next.children) | ||
count++ | ||
} | ||
if (next && next.type === 'WordNode') { | ||
nextValue = toString(next) | ||
tail = next | ||
queue.push(...next.children) | ||
count++ | ||
} | ||
if (previousValue.length < 3 && (!nextValue || nextValue.length < 3)) { | ||
// Add all found tokens to `prev`s children. | ||
previous.children = previous.children.concat(queue) | ||
if (previousValue.length < 3 && (!nextValue || nextValue.length < 3)) { | ||
// Add all found tokens to `prev`s children. | ||
previous.children.push(...queue) | ||
siblings.splice(index, count) | ||
siblings.splice(index, count) | ||
// Update position. | ||
if (previous.position && tail.position) { | ||
previous.position.end = tail.position.end | ||
// Update position. | ||
if (previous.position && tail.position) { | ||
previous.position.end = tail.position.end | ||
} | ||
// Next, iterate over the node *now* at the current position. | ||
return index | ||
} | ||
// Next, iterate over the node *now* at the current position. | ||
return index | ||
} | ||
} | ||
}) | ||
) |
@@ -0,1 +1,6 @@ | ||
/** | ||
* @typedef {import('nlcst').Sentence} Sentence | ||
* @typedef {import('nlcst').WordContent} WordContent | ||
*/ | ||
import {toString} from 'nlcst-to-string' | ||
@@ -8,66 +13,69 @@ import {modifyChildren} from 'unist-util-modify-children' | ||
// Merge words joined by certain punctuation marks. | ||
export const mergeInnerWordSymbol = modifyChildren(function ( | ||
child, | ||
index, | ||
parent | ||
) { | ||
if ( | ||
index > 0 && | ||
(child.type === 'SymbolNode' || child.type === 'PunctuationNode') | ||
) { | ||
const siblings = parent.children | ||
const previous = siblings[index - 1] | ||
export const mergeInnerWordSymbol = modifyChildren( | ||
/** | ||
* @type {import('unist-util-modify-children').Modifier<Sentence>} | ||
*/ | ||
function (child, index, parent) { | ||
if ( | ||
index > 0 && | ||
(child.type === 'SymbolNode' || child.type === 'PunctuationNode') | ||
) { | ||
const siblings = parent.children | ||
const previous = siblings[index - 1] | ||
if (previous && previous.type === 'WordNode') { | ||
let position = index - 1 | ||
let tokens = [] | ||
let queue = [] | ||
if (previous && previous.type === 'WordNode') { | ||
let position = index - 1 | ||
/** @type {Array<WordContent>} */ | ||
const tokens = [] | ||
/** @type {Array<WordContent>} */ | ||
let queue = [] | ||
// - If a token which is neither word nor inner word symbol is found, | ||
// the loop is broken | ||
// - If an inner word symbol is found, it’s queued | ||
// - If a word is found, it’s queued (and the queue stored and emptied) | ||
while (siblings[++position]) { | ||
const sibling = siblings[position] | ||
// - If a token which is neither word nor inner word symbol is found, | ||
// the loop is broken | ||
// - If an inner word symbol is found, it’s queued | ||
// - If a word is found, it’s queued (and the queue stored and emptied) | ||
while (siblings[++position]) { | ||
const sibling = siblings[position] | ||
if (sibling.type === 'WordNode') { | ||
tokens = tokens.concat(queue, sibling.children) | ||
if (sibling.type === 'WordNode') { | ||
tokens.push(...queue, ...sibling.children) | ||
queue = [] | ||
} else if ( | ||
(sibling.type === 'SymbolNode' || | ||
sibling.type === 'PunctuationNode') && | ||
wordSymbolInner.test(toString(sibling)) | ||
) { | ||
queue.push(sibling) | ||
} else { | ||
break | ||
queue = [] | ||
} else if ( | ||
(sibling.type === 'SymbolNode' || | ||
sibling.type === 'PunctuationNode') && | ||
wordSymbolInner.test(toString(sibling)) | ||
) { | ||
queue.push(sibling) | ||
} else { | ||
break | ||
} | ||
} | ||
} | ||
if (tokens.length > 0) { | ||
// If there is a queue, remove its length from `position`. | ||
if (queue.length > 0) { | ||
position -= queue.length | ||
} | ||
if (tokens.length > 0) { | ||
// If there is a queue, remove its length from `position`. | ||
if (queue.length > 0) { | ||
position -= queue.length | ||
} | ||
// Remove every (one or more) inner-word punctuation marks and children | ||
// of words. | ||
siblings.splice(index, position - index) | ||
// Remove every (one or more) inner-word punctuation marks and children | ||
// of words. | ||
siblings.splice(index, position - index) | ||
// Add all found tokens to `prev`s children. | ||
previous.children = previous.children.concat(tokens) | ||
// Add all found tokens to `prev`s children. | ||
previous.children.push(...tokens) | ||
const last = tokens[tokens.length - 1] | ||
const last = tokens[tokens.length - 1] | ||
// Update position. | ||
if (previous.position && last.position) { | ||
previous.position.end = last.position.end | ||
// Update position. | ||
if (previous.position && last.position) { | ||
previous.position.end = last.position.end | ||
} | ||
// Next, iterate over the node *now* at the current position. | ||
return index | ||
} | ||
// Next, iterate over the node *now* at the current position. | ||
return index | ||
} | ||
} | ||
} | ||
}) | ||
) |
@@ -0,1 +1,5 @@ | ||
/** | ||
* @typedef {import('nlcst').Paragraph} Paragraph | ||
*/ | ||
import {modifyChildren} from 'unist-util-modify-children' | ||
@@ -5,47 +9,49 @@ | ||
// contain word tokens. | ||
export const mergeNonWordSentences = modifyChildren(function ( | ||
child, | ||
index, | ||
parent | ||
) { | ||
const children = child.children | ||
let position = -1 | ||
export const mergeNonWordSentences = modifyChildren( | ||
/** | ||
* @type {import('unist-util-modify-children').Modifier<Paragraph>} | ||
*/ | ||
function (child, index, parent) { | ||
if ('children' in child) { | ||
let position = -1 | ||
while (children[++position]) { | ||
if (children[position].type === 'WordNode') { | ||
return | ||
} | ||
} | ||
while (child.children[++position]) { | ||
if (child.children[position].type === 'WordNode') { | ||
return | ||
} | ||
} | ||
const previous = parent.children[index - 1] | ||
const previous = parent.children[index - 1] | ||
if (previous) { | ||
previous.children = previous.children.concat(children) | ||
if (previous && 'children' in previous) { | ||
previous.children.push(...child.children) | ||
// Remove the child. | ||
parent.children.splice(index, 1) | ||
// Remove the child. | ||
parent.children.splice(index, 1) | ||
// Patch position. | ||
if (previous.position && child.position) { | ||
previous.position.end = child.position.end | ||
} | ||
// Patch position. | ||
if (previous.position && child.position) { | ||
previous.position.end = child.position.end | ||
} | ||
// Next, iterate over the node *now* at the current position (which was the | ||
// next node). | ||
return index | ||
} | ||
// Next, iterate over the node *now* at the current position (which was the | ||
// next node). | ||
return index | ||
} | ||
const next = parent.children[index + 1] | ||
const next = parent.children[index + 1] | ||
if (next) { | ||
next.children = children.concat(next.children) | ||
if (next && 'children' in next) { | ||
next.children.unshift(...child.children) | ||
// Patch position. | ||
if (next.position && child.position) { | ||
next.position.start = child.position.start | ||
// Patch position. | ||
if (next.position && child.position) { | ||
next.position.start = child.position.start | ||
} | ||
// Remove the child. | ||
parent.children.splice(index, 1) | ||
} | ||
} | ||
// Remove the child. | ||
parent.children.splice(index, 1) | ||
} | ||
}) | ||
) |
@@ -0,1 +1,5 @@ | ||
/** | ||
* @typedef {import('nlcst').Paragraph} Paragraph | ||
*/ | ||
import {toString} from 'nlcst-to-string' | ||
@@ -26,44 +30,46 @@ import {modifyChildren} from 'unist-util-modify-children' | ||
// certain word. | ||
export const mergePrefixExceptions = modifyChildren(function ( | ||
child, | ||
index, | ||
parent | ||
) { | ||
const children = child.children | ||
export const mergePrefixExceptions = modifyChildren( | ||
/** | ||
* @type {import('unist-util-modify-children').Modifier<Paragraph>} | ||
*/ | ||
function (child, index, parent) { | ||
if ('children' in child && child.children.length > 1) { | ||
const period = child.children[child.children.length - 1] | ||
if (children && children.length > 1) { | ||
const period = children[children.length - 1] | ||
if (period && toString(period) === '.') { | ||
const node = children[children.length - 2] | ||
if ( | ||
node && | ||
node.type === 'WordNode' && | ||
abbreviationPrefix.test(toString(node).toLowerCase()) | ||
period && | ||
(period.type === 'PunctuationNode' || period.type === 'SymbolNode') && | ||
toString(period) === '.' | ||
) { | ||
// Merge period into abbreviation. | ||
node.children.push(period) | ||
children.pop() | ||
const node = child.children[child.children.length - 2] | ||
// Update position. | ||
if (period.position && node.position) { | ||
node.position.end = period.position.end | ||
} | ||
if ( | ||
node && | ||
node.type === 'WordNode' && | ||
abbreviationPrefix.test(toString(node).toLowerCase()) | ||
) { | ||
// Merge period into abbreviation. | ||
node.children.push(period) | ||
child.children.pop() | ||
// Merge sentences. | ||
const next = parent.children[index + 1] | ||
// Update position. | ||
if (period.position && node.position) { | ||
node.position.end = period.position.end | ||
} | ||
if (next) { | ||
child.children = children.concat(next.children) | ||
// Merge sentences. | ||
const next = parent.children[index + 1] | ||
parent.children.splice(index + 1, 1) | ||
if (next && next.type === 'SentenceNode') { | ||
child.children.push(...next.children) | ||
parent.children.splice(index + 1, 1) | ||
// Update position. | ||
if (next.position && child.position) { | ||
child.position.end = next.position.end | ||
// Update position. | ||
if (next.position && child.position) { | ||
child.position.end = next.position.end | ||
} | ||
// Next, iterate over the current node again. | ||
return index - 1 | ||
} | ||
// Next, iterate over the current node again. | ||
return index - 1 | ||
} | ||
@@ -73,2 +79,2 @@ } | ||
} | ||
}) | ||
) |
@@ -0,1 +1,5 @@ | ||
/** | ||
* @typedef {import('nlcst').Paragraph} Paragraph | ||
*/ | ||
import {toString} from 'nlcst-to-string' | ||
@@ -10,82 +14,88 @@ import {visitChildren} from 'unist-util-visit-children' | ||
// or the next word (if available). | ||
export const mergeRemainingFullStops = visitChildren(function (child) { | ||
const children = child.children | ||
let position = children.length | ||
let hasFoundDelimiter = false | ||
export const mergeRemainingFullStops = visitChildren( | ||
/** | ||
* @type {import('unist-util-visit-children').Visitor<Paragraph>} | ||
*/ | ||
// eslint-disable-next-line complexity | ||
function (child, _, _parent) { | ||
if ('children' in child) { | ||
let position = child.children.length | ||
let hasFoundDelimiter = false | ||
while (children[--position]) { | ||
const grandchild = children[position] | ||
while (child.children[--position]) { | ||
const grandchild = child.children[position] | ||
if ( | ||
grandchild.type !== 'SymbolNode' && | ||
grandchild.type !== 'PunctuationNode' | ||
) { | ||
// This is a sentence without terminal marker, so we 'fool' the code to | ||
// make it think we have found one. | ||
if (grandchild.type === 'WordNode') { | ||
hasFoundDelimiter = true | ||
} | ||
if ( | ||
grandchild.type !== 'SymbolNode' && | ||
grandchild.type !== 'PunctuationNode' | ||
) { | ||
// This is a sentence without terminal marker, so we 'fool' the code to | ||
// make it think we have found one. | ||
if (grandchild.type === 'WordNode') { | ||
hasFoundDelimiter = true | ||
} | ||
continue | ||
} | ||
continue | ||
} | ||
// Exit when this token is not a terminal marker. | ||
if (!terminalMarker.test(toString(grandchild))) { | ||
continue | ||
} | ||
// Exit when this token is not a terminal marker. | ||
if (!terminalMarker.test(toString(grandchild))) { | ||
continue | ||
} | ||
// Ignore the first terminal marker found (starting at the end), as it | ||
// should not be merged. | ||
if (!hasFoundDelimiter) { | ||
hasFoundDelimiter = true | ||
// Ignore the first terminal marker found (starting at the end), as it | ||
// should not be merged. | ||
if (!hasFoundDelimiter) { | ||
hasFoundDelimiter = true | ||
continue | ||
} | ||
continue | ||
} | ||
// Only merge a single full stop. | ||
if (toString(grandchild) !== '.') { | ||
continue | ||
} | ||
// Only merge a single full stop. | ||
if (toString(grandchild) !== '.') { | ||
continue | ||
} | ||
const previous = child.children[position - 1] | ||
const next = child.children[position + 1] | ||
const previous = children[position - 1] | ||
const next = children[position + 1] | ||
if (previous && previous.type === 'WordNode') { | ||
const nextNext = child.children[position + 2] | ||
if (previous && previous.type === 'WordNode') { | ||
const nextNext = children[position + 2] | ||
// Continue when the full stop is followed by a space and another full | ||
// stop, such as: `{.} .` | ||
if ( | ||
next && | ||
nextNext && | ||
next.type === 'WhiteSpaceNode' && | ||
toString(nextNext) === '.' | ||
) { | ||
continue | ||
} | ||
// Continue when the full stop is followed by a space and another full | ||
// stop, such as: `{.} .` | ||
if ( | ||
next && | ||
nextNext && | ||
next.type === 'WhiteSpaceNode' && | ||
toString(nextNext) === '.' | ||
) { | ||
continue | ||
} | ||
// Remove `child` from parent. | ||
child.children.splice(position, 1) | ||
// Remove `child` from parent. | ||
children.splice(position, 1) | ||
// Add the punctuation mark at the end of the previous node. | ||
previous.children.push(grandchild) | ||
// Add the punctuation mark at the end of the previous node. | ||
previous.children.push(grandchild) | ||
// Update position. | ||
if (grandchild.position && previous.position) { | ||
previous.position.end = grandchild.position.end | ||
} | ||
// Update position. | ||
if (grandchild.position && previous.position) { | ||
previous.position.end = grandchild.position.end | ||
} | ||
position-- | ||
} else if (next && next.type === 'WordNode') { | ||
// Remove `child` from parent. | ||
child.children.splice(position, 1) | ||
position-- | ||
} else if (next && next.type === 'WordNode') { | ||
// Remove `child` from parent. | ||
children.splice(position, 1) | ||
// Add the punctuation mark at the start of the next node. | ||
next.children.unshift(grandchild) | ||
// Add the punctuation mark at the start of the next node. | ||
next.children.unshift(grandchild) | ||
if (grandchild.position && next.position) { | ||
next.position.start = grandchild.position.start | ||
if (grandchild.position && next.position) { | ||
next.position.start = grandchild.position.start | ||
} | ||
} | ||
} | ||
} | ||
} | ||
}) | ||
) |
@@ -0,31 +1,47 @@ | ||
/** | ||
* @typedef {import('unist').Node} Node | ||
* @typedef {import('nlcst').Sentence} Sentence | ||
* @typedef {import('nlcst').Paragraph} Paragraph | ||
* @typedef {import('nlcst').Root} Root | ||
*/ | ||
import {visitChildren} from 'unist-util-visit-children' | ||
// Patch the position on a parent node based on its first and last child. | ||
export const patchPosition = visitChildren(function (child, index, node) { | ||
const siblings = node.children | ||
export const patchPosition = visitChildren( | ||
/** | ||
* @type {import('unist-util-visit-children').Visitor<Paragraph|Sentence|Root>} | ||
*/ | ||
function (child, index, node) { | ||
const siblings = node.children | ||
if (!child.position) { | ||
return | ||
} | ||
if ( | ||
index < 1 && | ||
/* c8 ignore next */ | ||
(!node.position || !node.position.start) | ||
) { | ||
patch(node) | ||
// @ts-expect-error: we just set it. | ||
node.position.start = child.position.start | ||
} | ||
if ( | ||
index < 1 && | ||
/* c8 ignore next */ | ||
(!node.position || !node.position.start) | ||
) { | ||
patch(node) | ||
node.position.start = child.position.start | ||
if ( | ||
index === siblings.length - 1 && | ||
(!node.position || !node.position.end) | ||
) { | ||
patch(node) | ||
// @ts-expect-error: we just set it. | ||
node.position.end = child.position.end | ||
} | ||
} | ||
) | ||
if (index === siblings.length - 1 && (!node.position || !node.position.end)) { | ||
patch(node) | ||
node.position.end = child.position.end | ||
} | ||
}) | ||
// Add a `position` object when it does not yet exist on `node`. | |
/** | |
* Ensure `node` has a `position` field so that callers can assign | |
* `position.start` / `position.end` right after calling this. | |
* An existing (truthy) `position` is left untouched; otherwise an empty | |
* object is assigned. The node is mutated in place and nothing is returned. | |
* | |
* @param {Node} node | |
*   Node to patch. | |
*/ | |
function patch(node) { | |
if (!node.position) { | |
// The empty object is filled in by the caller; it is not yet a complete | |
// position, hence the suppressed type error below. | |
// @ts-expect-error: fine. | |
node.position = {} | |
} | |
} | |
@@ -0,12 +1,23 @@ | ||
/** | ||
* @typedef {import('nlcst').Root} Root | ||
* @typedef {import('nlcst').Paragraph} Paragraph | ||
*/ | ||
import {modifyChildren} from 'unist-util-modify-children' | ||
// Remove empty children. | ||
export const removeEmptyNodes = modifyChildren(function (child, index, parent) { | ||
if ('children' in child && child.children.length === 0) { | ||
parent.children.splice(index, 1) | ||
export const removeEmptyNodes = modifyChildren( | ||
/** | ||
* @type {import('unist-util-modify-children').Modifier<Root | Paragraph>} | ||
*/ | ||
// Next, iterate over the node *now* at the current position (which was the | ||
// next node). | ||
return index | ||
function (child, index, parent) { | ||
if ('children' in child && child.children.length === 0) { | ||
parent.children.splice(index, 1) | ||
// Next, iterate over the node *now* at the current position (which was the | ||
// next node). | ||
return index | ||
} | ||
} | ||
}) | ||
) |
{ | ||
"name": "parse-latin", | ||
"version": "5.0.1", | ||
"version": "6.0.0", | ||
"description": "Latin-script (natural language) parser", | ||
@@ -27,16 +27,22 @@ "license": "MIT", | ||
"main": "index.js", | ||
"types": "index.d.ts", | ||
"files": [ | ||
"lib/", | ||
"index.d.ts", | ||
"index.js" | ||
], | ||
"dependencies": { | ||
"@types/nlcst": "^1.0.0", | ||
"@types/unist": "^2.0.0", | ||
"nlcst-to-string": "^3.0.0", | ||
"unist-util-modify-children": "^3.0.0", | ||
"unist-util-visit-children": "^2.0.0" | ||
"unist-util-visit-children": "^2.0.0", | ||
"vfile": "^5.0.0" | ||
}, | ||
"devDependencies": { | ||
"@types/node": "^18.0.0", | ||
"@types/regenerate": "^1.0.0", | ||
"@unicode/unicode-13.0.0": "^1.0.0", | ||
"c8": "^7.0.0", | ||
"is-hidden": "^2.0.0", | ||
"negate": "^1.0.0", | ||
"nlcst-test": "^3.0.0", | ||
@@ -48,9 +54,11 @@ "nyc": "^15.0.0", | ||
"remark-preset-wooorm": "^9.0.0", | ||
"type-coverage": "^2.0.0", | ||
"typescript": "^4.0.0", | ||
"unist-util-remove-position": "^4.0.0", | ||
"vfile": "^5.0.0", | ||
"xo": "^0.52.0" | ||
}, | ||
"scripts": { | ||
"prepack": "npm run generate && npm run format", | ||
"prepack": "npm run generate && npm run build && npm run format", | ||
"fixture": "node script/generate-fixture.js", | ||
"build": "tsc --build --clean && tsc --build && type-coverage", | ||
"generate": "node script/build-expressions.js", | ||
@@ -60,3 +68,3 @@ "format": "remark . -qfo && prettier . -w --loglevel warn && xo --fix", | ||
"test-coverage": "c8 --check-coverage --100 --reporter lcov npm run test-api", | ||
"test": "npm run generate && npm run format && npm run test-coverage" | ||
"test": "npm run generate && npm run build && npm run format && npm run test-coverage" | ||
}, | ||
@@ -82,3 +90,8 @@ "prettier": { | ||
] | ||
}, | ||
"typeCoverage": { | ||
"atLeast": 100, | ||
"detail": true, | ||
"strict": true | ||
} | ||
} |
173
readme.md
@@ -7,22 +7,50 @@ # parse-latin | ||
[![Size][size-badge]][size] | ||
[![Chat][chat-badge]][chat] | ||
A Latin-script language parser for [**retext**][retext] producing **[nlcst][]** | ||
nodes. | ||
A natural language parser, for Latin-script languages, that produces [nlcst][]. | ||
## Contents | ||
* [What is this?](#what-is-this) | ||
* [When should I use this?](#when-should-i-use-this) | ||
* [Install](#install) | ||
* [Use](#use) | ||
* [API](#api) | ||
* [`ParseLatin()`](#parselatin) | ||
* [Algorithm](#algorithm) | ||
* [Types](#types) | ||
* [Compatibility](#compatibility) | ||
* [Related](#related) | ||
* [Contribute](#contribute) | ||
* [Security](#security) | ||
* [License](#license) | ||
## What is this? | ||
This package exposes a parser that takes Latin-script natural language and | ||
produces a syntax tree. | ||
## When should I use this? | ||
If you want to handle natural language as syntax trees manually, use this. | ||
Alternatively, you can use the retext plugin [`retext-latin`][retext-latin], | ||
which wraps this project to also parse natural language at a higher-level | ||
(easier) abstraction. | ||
Whether Old-English (“þā gewearþ þǣm hlāforde and þǣm hȳrigmannum wiþ ānum | ||
penninge”), Icelandic (“Hvað er að frétta”), or French (“Où sont les toilettes?”), | |
`parse-latin` does a good job at tokenizing it. | ||
this project does a good job at tokenizing it. | ||
Note also that `parse-latin` does a decent job at tokenizing Latin-like scripts, | ||
Cyrillic (“Добро пожаловать!”), Georgian (“როგორა ხარ?”), Armenian (“Շատ հաճելի | ||
է”), and such. | ||
For English and Dutch, you can instead use [`parse-english`][parse-english] and | ||
[`parse-dutch`][parse-dutch]. | ||
You can somewhat use this for Latin-like scripts, such as Cyrillic | ||
(“Добро пожаловать!”), Georgian (“როგორა ხარ?”), Armenian (“Շատ հաճելի է”), | ||
and such. | ||
## Install | ||
This package is ESM only: Node 12+ is needed to use it and it must be `import`ed | ||
instead of `require`d. | ||
This package is [ESM only][esm]. | ||
In Node.js (version 14.14+, 16.0+), install with [npm][]: | ||
[npm][]: | ||
```sh | ||
@@ -32,2 +60,16 @@ npm install parse-latin | ||
In Deno with [`esm.sh`][esmsh]: | ||
```js | ||
import {ParseLatin} from 'https://esm.sh/parse-latin@6' | ||
``` | ||
In browsers with [`esm.sh`][esmsh]: | ||
```html | ||
<script type="module"> | ||
import {ParseLatin} from 'https://esm.sh/parse-latin@6?bundle' | ||
</script> | ||
``` | ||
## Use | ||
@@ -44,3 +86,3 @@ | ||
Which, when inspecting, yields: | ||
Yields: | ||
@@ -64,55 +106,76 @@ ```txt | ||
This package exports the following identifiers: `ParseLatin`. | ||
This package exports the identifier `ParseLatin`. | ||
There is no default export. | ||
### `ParseLatin(value)` | ||
### `ParseLatin()` | ||
Exposes the functionality needed to tokenize natural Latin-script languages into | ||
a syntax tree. | ||
If `value` is passed here, it’s not needed to give it to `#parse()`. | ||
Create a new parser. | ||
#### `ParseLatin#tokenize(value)` | ||
#### `ParseLatin#parse(value)` | ||
Tokenize `value` (`string`) into letters and numbers (words), white space, and | ||
everything else (punctuation). | ||
The returned nodes are a flat list without paragraphs or sentences. | ||
Turn natural language into a syntax tree. | ||
###### Returns | ||
##### Parameters | ||
[`Array.<Node>`][nlcst] — Nodes. | ||
###### `value` | ||
#### `ParseLatin#parse(value)` | ||
Value to parse (`string`). | ||
Tokenize `value` (`string`) into an [NLCST][] tree. | ||
The returned node is a `RootNode` with in it paragraphs and sentences. | ||
##### Returns | ||
###### Returns | ||
[`RootNode`][root]. | ||
[`Node`][nlcst] — Root node. | ||
## Algorithm | ||
> Note: The easiest way to see **how parse-latin tokenizes and parses**, is by | ||
> using the [online parser demo][demo], which | ||
> shows the syntax tree corresponding to the typed text. | ||
> 👉 **Note**: | ||
> The easiest way to see how `parse-latin` parses, is by using the | ||
> [online parser demo][demo], which shows the syntax tree corresponding to | ||
> the typed text. | ||
`parse-latin` splits text into white space, word, and punctuation tokens. | ||
`parse-latin` starts out with a pretty easy definition, one that most other | ||
tokenizers use: | ||
`parse-latin` splits text into white space, punctuation, symbol, and word | ||
tokens: | ||
* A “word” is one or more letter or number characters | ||
* A “white space” is one or more white space characters | ||
* A “punctuation” is one or more of anything else | ||
* “word” is one or more unicode letters or numbers | ||
* “white space” is one or more unicode white space characters | ||
* “punctuation” is one or more unicode punctuation characters | ||
* “symbol” is one or more of anything else | ||
Then, it manipulates and merges those tokens into a ([nlcst][]) syntax tree, | ||
adding sentences and paragraphs where needed. | ||
Then, it manipulates and merges those tokens into a syntax tree, adding | ||
sentences and paragraphs where needed. | ||
* Some punctuation marks are part of the word they occur in, such as | ||
* some punctuation marks are part of the word they occur in, such as | ||
`non-profit`, `she’s`, `G.I.`, `11:00`, `N/A`, `&c`, `nineteenth- and…` | ||
* Some full-stops do not mark a sentence end, such as `1.`, `e.g.`, `id.` | ||
* Although full-stops, question marks, and exclamation marks (sometimes) end a | ||
* some periods do not mark a sentence end, such as `1.`, `e.g.`, `id.` | ||
* although periods, question marks, and exclamation marks (sometimes) end a | ||
sentence, that end might not occur directly after the mark, such as `.)`, | ||
`."` | ||
* And many more exceptions | ||
* …and many more exceptions | ||
## Types | ||
This package is fully typed with [TypeScript][]. | ||
It exports no additional types. | ||
## Compatibility | ||
This package is at least compatible with all maintained versions of Node.js. | ||
As of now, that is Node.js 14.14+ and 16.0+. | ||
It also works in Deno and modern browsers. | ||
## Related | ||
* [`parse-english`](https://github.com/wooorm/parse-english) | ||
— English (natural language) parser | ||
* [`parse-dutch`](https://github.com/wooorm/parse-dutch) | ||
— Dutch (natural language) parser | ||
## Contribute | ||
Yes please! | ||
See [How to Contribute to Open Source][contribute]. | ||
## Security | ||
This package is safe. | ||
## License | ||
@@ -140,6 +203,2 @@ | ||
[chat-badge]: https://img.shields.io/badge/join%20the%20community-on%20spectrum-7b16ff.svg | ||
[chat]: https://spectrum.chat/unified/retext | ||
[npm]: https://docs.npmjs.com/cli/install | ||
@@ -149,2 +208,10 @@ | ||
[esm]: https://gist.github.com/sindresorhus/a39789f98801d908bbc7ff3ecc99d99c | ||
[esmsh]: https://esm.sh | ||
[typescript]: https://www.typescriptlang.org | ||
[contribute]: https://opensource.guide/how-to-contribute/ | ||
[license]: license | ||
@@ -154,4 +221,10 @@ | ||
[retext]: https://github.com/retextjs/retext | ||
[nlcst]: https://github.com/syntax-tree/nlcst | ||
[nlcst]: https://github.com/syntax-tree/nlcst | ||
[root]: https://github.com/syntax-tree/nlcst#root | ||
[retext-latin]: https://github.com/retextjs/retext/tree/main/packages/retext-latin | ||
[parse-english]: https://github.com/wooorm/parse-english | ||
[parse-dutch]: https://github.com/wooorm/parse-dutch |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
68711
43
1280
224
6
15
+ Added @types/nlcst@^1.0.0
+ Added @types/unist@^2.0.0
+ Added vfile@^5.0.0
+ Added is-buffer@2.0.5 (transitive)
+ Added unist-util-stringify-position@3.0.3 (transitive)
+ Added vfile@5.3.7 (transitive)
+ Added vfile-message@3.1.4 (transitive)