Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

parse-latin

Package Overview
Dependencies
Maintainers
1
Versions
37
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

parse-latin - npm Package Compare versions

Comparing version 5.0.1 to 6.0.0

index.d.ts

522

lib/index.js

@@ -0,1 +1,12 @@

/**
* @typedef {import('vfile').VFile} VFile
* @typedef {import('nlcst').Parent} Parent
* @typedef {import('nlcst').Content} Content
* @typedef {import('nlcst').SentenceContent} SentenceContent
* @typedef {import('nlcst').Root} Root
* @typedef {import('nlcst').Paragraph} Paragraph
* @typedef {import('nlcst').Sentence} Sentence
*/
import {toString} from 'nlcst-to-string'
import {mergeInitialWordSymbol} from './plugin/merge-initial-word-symbol.js'

@@ -6,3 +17,2 @@ import {mergeFinalWordSymbol} from './plugin/merge-final-word-symbol.js'

import {mergeInitialisms} from './plugin/merge-initialisms.js'
import {mergeWords} from './plugin/merge-words.js'
import {patchPosition} from './plugin/patch-position.js'

@@ -20,3 +30,2 @@ import {mergeNonWordSentences} from './plugin/merge-non-word-sentences.js'

import {removeEmptyNodes} from './plugin/remove-empty-nodes.js'
import {parserFactory} from './parser.js'
import {

@@ -35,25 +44,28 @@ newLine,

export class ParseLatin {
/**
* @param {string|null|undefined} [doc]
* @param {VFile|null|undefined} [file]
*/
constructor(doc, file) {
const value = file || doc
/** @type {string|null} */
this.doc = value ? String(value) : null
}
// Run transform plugins for `key` on `nodes`.
run(key, nodes) {
const wareKey = key + 'Plugins'
const plugins = this[wareKey]
let index = -1
if (plugins) {
while (plugins[++index]) {
plugins[index](nodes)
}
}
return nodes
/** @type {Array<(node: Root) => void>} */
this.tokenizeRootPlugins = [...this.tokenizeRootPlugins]
/** @type {Array<(node: Paragraph) => void>} */
this.tokenizeParagraphPlugins = [...this.tokenizeParagraphPlugins]
/** @type {Array<(node: Sentence) => void>} */
this.tokenizeSentencePlugins = [...this.tokenizeSentencePlugins]
}
// Easy access to the document parser. This additionally supports retext-style
// invocation: where an instance is created for each file, and the file is given
// on construction.
/**
* Easy access to the document parser.
* This additionally supports `retext`-like call: where an instance is
* created for each file, and the file is given on construction.
*
* @param {string|undefined|null} [value]
* @returns {Root}
*/
parse(value) {

@@ -63,259 +75,164 @@ return this.tokenizeRoot(value || this.doc)

// Transform a `value` into a list of `NLCSTNode`s.
tokenize(value) {
const tokens = []
/**
* Parse as a root.
*
* @param {string|undefined|null} [value]
* @returns {Root}
*/
tokenizeRoot(value) {
const paragraph = this.tokenizeParagraph(value)
/** @type {Root} */
const result = {
type: 'RootNode',
children: splitNode(paragraph, 'WhiteSpaceNode', newLine)
}
if (value === null || value === undefined) {
value = ''
} else if (value instanceof String) {
value = value.toString()
let index = -1
while (this.tokenizeRootPlugins[++index]) {
this.tokenizeRootPlugins[index](result)
}
if (typeof value !== 'string') {
// Return the given nodes if this is either an empty array, or an array with
// a node as a first child.
if ('length' in value && (!value[0] || value[0].type)) {
return value
}
return result
}
throw new Error(
"Illegal invocation: '" +
value +
"' is not a valid argument for 'ParseLatin'"
)
/**
* Parse as a paragraph.
*
* @param {string|undefined|null} [value]
* @returns {Paragraph}
*/
tokenizeParagraph(value) {
const sentence = this.tokenizeSentence(value)
/** @type {Paragraph} */
const result = {
type: 'ParagraphNode',
children: splitNode(sentence, 'PunctuationNode', terminalMarker)
}
if (!value) {
return tokens
let index = -1
while (this.tokenizeParagraphPlugins[++index]) {
this.tokenizeParagraphPlugins[index](result)
}
// Eat mechanism to use.
const eater = this.position ? eat : noPositionEat
return result
}
let index = 0
let offset = 0
let line = 1
let column = 1
let previous = ''
let queue = ''
let left
let right
let character
/**
* Parse as a sentence.
*
* @param {string|undefined|null} [value]
* @returns {Sentence}
*/
tokenizeSentence(value) {
const children = this.tokenize(value)
/** @type {Sentence} */
const result = {type: 'SentenceNode', children}
while (index < value.length) {
character = value.charAt(index)
let index = -1
while (this.tokenizeSentencePlugins[++index]) {
this.tokenizeSentencePlugins[index](result)
}
if (whiteSpace.test(character)) {
right = 'WhiteSpace'
} else if (punctuation.test(character)) {
right = 'Punctuation'
} else if (word.test(character)) {
right = 'Word'
} else {
right = 'Symbol'
}
return result
}
tick.call(this)
/**
* Transform a `value` into a list of `NLCSTNode`s.
*
* @param {string|undefined|null} [value]
* @returns {Array<SentenceContent>}
*/
tokenize(value) {
/** @type {Array<SentenceContent>} */
const children = []
previous = character
character = ''
left = right
right = null
index++
if (!value) {
return children
}
tick.call(this)
const currentPoint = {line: 1, column: 1, offset: 0}
let from = 0
let index = 0
let start = {...currentPoint}
/** @type {SentenceContent['type']|undefined} */
let previousType
/** @type {string|undefined} */
let previous
return tokens
while (index < value.length) {
const current = value.charAt(index)
const currentType = whiteSpace.test(current)
? 'WhiteSpaceNode'
: punctuation.test(current)
? 'PunctuationNode'
: word.test(current)
? 'WordNode'
: 'SymbolNode'
// Check one character.
function tick() {
if (
left === right &&
(left === 'Word' ||
left === 'WhiteSpace' ||
character === previous ||
surrogates.test(character))
from < index &&
previousType &&
currentType &&
!(
previousType === currentType &&
// Words or white space continue.
(previousType === 'WordNode' ||
previousType === 'WhiteSpaceNode' ||
// Same character of punctuation or symbol also continues.
current === previous ||
// Surrogates of punctuation or symbol also continue.
surrogates.test(current))
)
) {
queue += character
} else {
// Flush the previous queue.
if (queue) {
this['tokenize' + left](queue, eater)
}
queue = character
children.push(createNode(previousType, value.slice(from, index)))
from = index
start = {...currentPoint}
}
}
// Remove `subvalue` from `value`.
// Expects `subvalue` to be at the start from `value`, and applies no
// validation.
function eat(subvalue) {
const pos = position()
update(subvalue)
return apply
// Add the given arguments, add `position` to the returned node, and return
// the node.
function apply(...input) {
return pos(add(...input))
if (current === '\r' || (current === '\n' && previous !== '\r')) {
currentPoint.line++
currentPoint.column = 1
} else if (current !== '\n') {
currentPoint.column++
}
}
// Remove `subvalue` from `value`.
// Does not patch positional information.
function noPositionEat() {
return add
currentPoint.offset++
previousType = currentType
previous = current
index++
}
// Add mechanism.
function add(node, parent) {
if (parent) {
parent.children.push(node)
} else {
tokens.push(node)
}
return node
if (previousType && from < index) {
children.push(createNode(previousType, value.slice(from, index)))
}
// Mark position and patch `node.position`.
function position() {
const before = now()
return children
// Add the position to a node.
function patch(node) {
node.position = new Position(before)
return node
}
return patch
/**
*
* @param {SentenceContent['type']} type
* @param {string} value
* @returns {SentenceContent}
*/
function createNode(type, value) {
return type === 'WordNode'
? {
type: 'WordNode',
children: [
{
type: 'TextNode',
value,
position: {start, end: {...currentPoint}}
}
],
position: {start, end: {...currentPoint}}
}
: {type, value, position: {start, end: {...currentPoint}}}
}
// Update line and column based on `value`.
function update(subvalue) {
let character = -1
let lastIndex = -1
offset += subvalue.length
while (++character < subvalue.length) {
if (subvalue.charAt(character) === '\n') {
lastIndex = character
line++
}
}
if (lastIndex < 0) {
column += subvalue.length
} else {
column = subvalue.length - lastIndex
}
}
// Store position information for a node.
function Position(start) {
this.start = start
this.end = now()
}
// Get the current position.
function now() {
return {line, column, offset}
}
}
}
// Default position.
ParseLatin.prototype.position = true
// Create text nodes.
ParseLatin.prototype.tokenizeSymbol = createTextFactory('Symbol')
ParseLatin.prototype.tokenizeWhiteSpace = createTextFactory('WhiteSpace')
ParseLatin.prototype.tokenizePunctuation = createTextFactory('Punctuation')
ParseLatin.prototype.tokenizeSource = createTextFactory('Source')
ParseLatin.prototype.tokenizeText = createTextFactory('Text')
// Inject `plugins` to modify the result of the method at `key` on the operated
// on context.
ParseLatin.prototype.use = useFactory(function (context, key, plugins) {
context[key] = context[key].concat(plugins)
})
// Inject `plugins` to modify the result of the method at `key` on the operated
// on context, before any other.
ParseLatin.prototype.useFirst = useFactory(function (context, key, plugins) {
context[key] = plugins.concat(context[key])
})
// PARENT NODES
//
// All these nodes are `pluggable`: they come with a `use` method which accepts
// a plugin (`function(NLCSTNode)`).
// Every time one of these methods is called, the plugin is invoked with the
// node, allowing for easy modification.
//
// In fact, the internal transformation from `tokenize` (a list of words, white
// space, punctuation, and symbols) to `tokenizeRoot` (an NLCST tree), is also
// implemented through this mechanism.
// Create a `WordNode` with its children set to a single `TextNode`, its value
// set to the given `value`.
pluggable(ParseLatin, 'tokenizeWord', function (value, eat) {
const add = (eat || noopEat)('')
const parent = {type: 'WordNode', children: []}
this.tokenizeText(value, eat, parent)
return add(parent)
})
// Create a `SentenceNode` with its children set to `Node`s, their values set
// to the tokenized given `value`.
//
// Unless plugins add new nodes, the sentence is populated by `WordNode`s,
// `SymbolNode`s, `PunctuationNode`s, and `WhiteSpaceNode`s.
pluggable(
ParseLatin,
'tokenizeSentence',
parserFactory({type: 'SentenceNode', tokenizer: 'tokenize'})
)
// Create a `ParagraphNode` with its children set to `Node`s, their values set
// to the tokenized given `value`.
//
// Unless plugins add new nodes, the paragraph is populated by `SentenceNode`s
// and `WhiteSpaceNode`s.
pluggable(
ParseLatin,
'tokenizeParagraph',
parserFactory({
type: 'ParagraphNode',
delimiter: terminalMarker,
delimiterType: 'PunctuationNode',
tokenizer: 'tokenizeSentence'
})
)
// Create a `RootNode` with its children set to `Node`s, their values set to the
// tokenized given `value`.
pluggable(
ParseLatin,
'tokenizeRoot',
parserFactory({
type: 'RootNode',
delimiter: newLine,
delimiterType: 'WhiteSpaceNode',
tokenizer: 'tokenizeParagraph'
})
)
// PLUGINS
ParseLatin.prototype.use('tokenizeSentence', [
/** List of transforms handling a sentence. */
ParseLatin.prototype.tokenizeSentencePlugins = [
mergeInitialWordSymbol,

@@ -326,7 +243,7 @@ mergeFinalWordSymbol,

mergeInitialisms,
mergeWords,
patchPosition
])
]
ParseLatin.prototype.use('tokenizeParagraph', [
/** List of transforms handling a paragraph. */
ParseLatin.prototype.tokenizeParagraphPlugins = [
mergeNonWordSentences,

@@ -344,5 +261,6 @@ mergeAffixSymbol,

patchPosition
])
]
ParseLatin.prototype.use('tokenizeRoot', [
/** List of transforms handling a root. */
ParseLatin.prototype.tokenizeRootPlugins = [
makeInitialWhiteSpaceSiblings,

@@ -352,80 +270,48 @@ makeFinalWhiteSpaceSiblings,

patchPosition
])
]
// TEXT NODES
/**
* A function that splits one node into several nodes.
*
* @template {Parent} TheNode
* @param {TheNode} node
* @param {RegExp} expression
* @param {Content['type']} childType
* @returns {Array<TheNode>}
*/
function splitNode(node, childType, expression) {
/** @type {Array<TheNode>} */
const result = []
let index = -1
let start = 0
// Factory to create a `Text`.
function createTextFactory(type) {
type += 'Node'
while (++index < node.children.length) {
const token = node.children[index]
return createText
if (
index === node.children.length - 1 ||
(token.type === childType && expression.test(toString(token)))
) {
/** @type {TheNode} */
// @ts-expect-error: fine
const parent = {
type: node.type,
children: node.children.slice(start, index + 1)
}
// Construct a `Text` from a bound `type`
function createText(value, eat, parent) {
if (value === null || value === undefined) {
value = ''
}
const first = node.children[start]
const last = token
if (first.position && last.position) {
parent.position = {
start: first.position.start,
end: last.position.end
}
}
return (eat || noopEat)(value)({type, value: String(value)}, parent)
}
}
// Make a method “pluggable”.
function pluggable(Constructor, key, callback) {
// Set a pluggable version of `callback` on `Constructor`.
Constructor.prototype[key] = function (...input) {
return this.run(key, callback.apply(this, input))
}
}
// Factory to inject `plugins`. Takes `callback` for the actual inserting.
function useFactory(callback) {
return use
// Validate if `plugins` can be inserted.
// Invokes the bound `callback` to do the actual inserting.
function use(key, plugins) {
// Throw if the method is not pluggable.
if (!(key in this)) {
throw new Error(
'Illegal Invocation: Unsupported `key` for ' +
'`use(key, plugins)`. Make sure `key` is a ' +
'supported function'
)
result.push(parent)
start = index + 1
}
// Fail silently when no plugins are given.
if (!plugins) {
return
}
const wareKey = key + 'Plugins'
// Make sure `plugins` is a list.
plugins = typeof plugins === 'function' ? [plugins] : plugins.concat()
// Make sure `wareKey` exists.
if (!this[wareKey]) {
this[wareKey] = []
}
// Invoke callback with the ware key and plugins.
callback(this, wareKey, plugins)
}
}
// Add mechanism used when text-tokenisers are called directly outside of the
// `tokenize` function.
function noopAdd(node, parent) {
if (parent) {
parent.children.push(node)
}
return node
return result
}
// Eat and add mechanism without adding positional information, used when
// text-tokenisers are called directly outside of the `tokenize` function.
function noopEat() {
return noopAdd
}

@@ -0,1 +1,6 @@

/**
* @typedef {import('nlcst').Paragraph} Paragraph
* @typedef {import('nlcst').Sentence} Sentence
*/
import {toString} from 'nlcst-to-string'

@@ -5,48 +10,51 @@ import {modifyChildren} from 'unist-util-modify-children'

// Break a sentence if a white space with more than one new-line is found.
export const breakImplicitSentences = modifyChildren(function (
child,
index,
parent
) {
if (child.type !== 'SentenceNode') {
return
}
export const breakImplicitSentences = modifyChildren(
/**
* @type {import('unist-util-modify-children').Modifier<Paragraph>}
*/
const children = child.children
function (child, index, parent) {
if (child.type !== 'SentenceNode') {
return
}
// Ignore first and last child.
let position = 0
const children = child.children
while (++position < children.length - 1) {
const node = children[position]
// Ignore first and last child.
let position = 0
if (
node.type !== 'WhiteSpaceNode' ||
toString(node).split(/\r\n|\r|\n/).length < 3
) {
continue
}
while (++position < children.length - 1) {
const node = children[position]
child.children = children.slice(0, position)
if (
node.type !== 'WhiteSpaceNode' ||
toString(node).split(/\r\n|\r|\n/).length < 3
) {
continue
}
const insertion = {
type: 'SentenceNode',
children: children.slice(position + 1)
}
child.children = children.slice(0, position)
const tail = children[position - 1]
const head = children[position + 1]
/** @type {Sentence} */
const insertion = {
type: 'SentenceNode',
children: children.slice(position + 1)
}
parent.children.splice(index + 1, 0, node, insertion)
const tail = children[position - 1]
const head = children[position + 1]
if (child.position && tail.position && head.position) {
const end = child.position.end
parent.children.splice(index + 1, 0, node, insertion)
child.position.end = tail.position.end
if (child.position && tail.position && head.position) {
const end = child.position.end
insertion.position = {start: head.position.start, end}
child.position.end = tail.position.end
insertion.position = {start: head.position.start, end}
}
return index + 1
}
return index + 1
}
})
)

@@ -0,1 +1,6 @@

/**
* @typedef {import('nlcst').Root} Root
* @typedef {import('nlcst').Paragraph} Paragraph
*/
import {modifyChildren} from 'unist-util-modify-children'

@@ -5,24 +10,25 @@

// paragraphs.
export const makeFinalWhiteSpaceSiblings = modifyChildren(function (
child,
index,
parent
) {
const children = child.children
export const makeFinalWhiteSpaceSiblings = modifyChildren(
/**
* @type {import('unist-util-modify-children').Modifier<Root | Paragraph>}
*/
if (
children &&
children.length > 0 &&
children[children.length - 1].type === 'WhiteSpaceNode'
) {
parent.children.splice(index + 1, 0, child.children.pop())
const previous = children[children.length - 1]
function (child, index, parent) {
if ('children' in child) {
const tail = child.children[child.children.length - 1]
if (previous && previous.position && child.position) {
child.position.end = previous.position.end
if (tail && tail.type === 'WhiteSpaceNode') {
child.children.pop() // Remove `tail`.
parent.children.splice(index + 1, 0, tail)
const previous = child.children[child.children.length - 1]
if (previous && previous.position && child.position) {
child.position.end = previous.position.end
}
// Next, iterate over the current node again.
return index
}
}
// Next, iterate over the current node again.
return index
}
})
)

@@ -0,1 +1,6 @@

/**
* @typedef {import('nlcst').Root} Root
* @typedef {import('nlcst').Paragraph} Paragraph
*/
import {visitChildren} from 'unist-util-visit-children'

@@ -5,20 +10,20 @@

// sentences.
export const makeInitialWhiteSpaceSiblings = visitChildren(function (
child,
index,
parent
) {
const children = child.children
if (
children &&
children.length > 0 &&
children[0].type === 'WhiteSpaceNode'
) {
parent.children.splice(index, 0, children.shift())
const next = children[0]
export const makeInitialWhiteSpaceSiblings = visitChildren(
/**
* @type {import('unist-util-visit-children').Visitor<Paragraph|Root>}
*/
function (child, index, parent) {
if ('children' in child && child.children) {
const head = child.children[0]
if (head && head.type === 'WhiteSpaceNode') {
child.children.shift()
parent.children.splice(index, 0, head)
const next = child.children[0]
if (next && next.position && child.position) {
child.position.start = next.position.start
if (next && next.position && child.position) {
child.position.start = next.position.start
}
}
}
}
})
)

@@ -0,1 +1,5 @@

/**
* @typedef {import('nlcst').Paragraph} Paragraph
*/
import {toString} from 'nlcst-to-string'

@@ -6,43 +10,46 @@ import {modifyChildren} from 'unist-util-modify-children'

// comma.
export const mergeAffixExceptions = modifyChildren(function (
child,
index,
parent
) {
const children = child.children
export const mergeAffixExceptions = modifyChildren(
/**
* @type {import('unist-util-modify-children').Modifier<Paragraph>}
*/
function (child, index, parent) {
const previous = parent.children[index - 1]
if (!children || children.length === 0 || index < 1) {
return
}
if (
previous &&
'children' in previous &&
'children' in child &&
child.children.length > 0
) {
let position = -1
let position = -1
while (child.children[++position]) {
const node = child.children[position]
while (children[++position]) {
const node = children[position]
if (node.type === 'WordNode') {
return
}
if (node.type === 'WordNode') {
return
}
if (node.type === 'SymbolNode' || node.type === 'PunctuationNode') {
const value = toString(node)
if (node.type === 'SymbolNode' || node.type === 'PunctuationNode') {
const value = toString(node)
if (value !== ',' && value !== ';') {
return
}
if (value !== ',' && value !== ';') {
return
}
previous.children.push(...child.children)
const previousChild = parent.children[index - 1]
previousChild.children = previousChild.children.concat(children)
// Update position.
if (previous.position && child.position) {
previous.position.end = child.position.end
}
// Update position.
if (previousChild.position && child.position) {
previousChild.position.end = child.position.end
parent.children.splice(index, 1)
// Next, iterate over the node *now* at the current position.
return index
}
}
parent.children.splice(index, 1)
// Next, iterate over the node *now* at the current position.
return index
}
}
})
)

@@ -0,1 +1,5 @@

/**
* @typedef {import('nlcst').Paragraph} Paragraph
*/
import {toString} from 'nlcst-to-string'

@@ -11,29 +15,35 @@ import {modifyChildren} from 'unist-util-modify-children'

// sentence) to the previous sentence.
export const mergeAffixSymbol = modifyChildren(function (child, index, parent) {
const children = child.children
export const mergeAffixSymbol = modifyChildren(
/**
* @type {import('unist-util-modify-children').Modifier<Paragraph>}
*/
function (child, index, parent) {
if ('children' in child && child.children.length > 0 && index > 0) {
const previous = parent.children[index - 1]
const first = child.children[0]
const second = child.children[1]
if (children && children.length > 0 && index > 0) {
const first = children[0]
const second = children[1]
const previous = parent.children[index - 1]
if (
previous &&
previous.type === 'SentenceNode' &&
(first.type === 'SymbolNode' || first.type === 'PunctuationNode') &&
affixSymbol.test(toString(first))
) {
child.children.shift() // Remove `first`.
previous.children.push(first)
if (
(first.type === 'SymbolNode' || first.type === 'PunctuationNode') &&
affixSymbol.test(toString(first))
) {
previous.children.push(children.shift())
// Update position.
if (first.position && previous.position) {
previous.position.end = first.position.end
}
// Update position.
if (first.position && previous.position) {
previous.position.end = first.position.end
}
if (second && second.position && child.position) {
child.position.start = second.position.start
}
if (second && second.position && child.position) {
child.position.start = second.position.start
// Next, iterate over the previous node again.
return index - 1
}
// Next, iterate over the previous node again.
return index - 1
}
}
})
)

@@ -0,1 +1,5 @@

/**
* @typedef {import('nlcst').Sentence} Sentence
*/
import {toString} from 'nlcst-to-string'

@@ -5,37 +9,38 @@ import {modifyChildren} from 'unist-util-modify-children'

// Merge certain punctuation marks into their preceding words.
export const mergeFinalWordSymbol = modifyChildren(function (
child,
index,
parent
) {
if (
index > 0 &&
(child.type === 'SymbolNode' || child.type === 'PunctuationNode') &&
toString(child) === '-'
) {
const children = parent.children
const previous = children[index - 1]
const next = children[index + 1]
export const mergeFinalWordSymbol = modifyChildren(
/**
* @type {import('unist-util-modify-children').Modifier<Sentence>}
*/
function (child, index, parent) {
if (
(!next || next.type !== 'WordNode') &&
previous &&
previous.type === 'WordNode'
index > 0 &&
(child.type === 'SymbolNode' || child.type === 'PunctuationNode') &&
toString(child) === '-'
) {
// Remove `child` from parent.
children.splice(index, 1)
const children = parent.children
const previous = children[index - 1]
const next = children[index + 1]
// Add the punctuation mark at the end of the previous node.
previous.children.push(child)
if (
(!next || next.type !== 'WordNode') &&
previous &&
previous.type === 'WordNode'
) {
// Remove `child` from parent.
children.splice(index, 1)
// Update position.
if (previous.position && child.position) {
previous.position.end = child.position.end
// Add the punctuation mark at the end of the previous node.
previous.children.push(child)
// Update position.
if (previous.position && child.position) {
previous.position.end = child.position.end
}
// Next, iterate over the node *now* at the current position (which was
// the next node).
return index
}
// Next, iterate over the node *now* at the current position (which was
// the next node).
return index
}
}
})
)

@@ -0,1 +1,5 @@

/**
* @typedef {import('nlcst').Paragraph} Paragraph
*/
import {toString} from 'nlcst-to-string'

@@ -7,29 +11,30 @@ import {modifyChildren} from 'unist-util-modify-children'

// lower case letter.
export const mergeInitialDigitSentences = modifyChildren(function (
child,
index,
parent
) {
const children = child.children
const siblings = parent.children
const previous = siblings[index - 1]
const head = children[0]
export const mergeInitialDigitSentences = modifyChildren(
/**
* @type {import('unist-util-modify-children').Modifier<Paragraph>}
*/
function (child, index, parent) {
const previous = parent.children[index - 1]
if (
previous &&
head &&
head.type === 'WordNode' &&
digitStart.test(toString(head))
) {
previous.children = previous.children.concat(children)
siblings.splice(index, 1)
if (
previous &&
previous.type === 'SentenceNode' &&
child.type === 'SentenceNode'
) {
const head = child.children[0]
// Update position.
if (previous.position && child.position) {
previous.position.end = child.position.end
if (head && head.type === 'WordNode' && digitStart.test(toString(head))) {
previous.children.push(...child.children)
parent.children.splice(index, 1)
// Update position.
if (previous.position && child.position) {
previous.position.end = child.position.end
}
// Next, iterate over the node *now* at the current position.
return index
}
}
// Next, iterate over the node *now* at the current position.
return index
}
})
)

@@ -0,1 +1,5 @@

/**
* @typedef {import('nlcst').Paragraph} Paragraph
*/
import {toString} from 'nlcst-to-string'

@@ -9,41 +13,42 @@ import {modifyChildren} from 'unist-util-modify-children'

// lower case letter.
export const mergeInitialLowerCaseLetterSentences = modifyChildren(function (
child,
index,
parent
) {
const children = child.children
export const mergeInitialLowerCaseLetterSentences = modifyChildren(
/**
* @type {import('unist-util-modify-children').Modifier<Paragraph>}
*/
function (child, index, parent) {
if (child.type === 'SentenceNode' && index > 0) {
const previous = parent.children[index - 1]
const children = child.children
if (children && children.length > 0 && index > 0) {
let position = -1
if (children.length > 0 && previous.type === 'SentenceNode') {
let position = -1
while (children[++position]) {
const node = children[position]
while (children[++position]) {
const node = children[position]
if (node.type === 'WordNode') {
if (!lowerInitial.test(toString(node))) {
return
}
if (node.type === 'WordNode') {
if (!lowerInitial.test(toString(node))) {
return
}
const siblings = parent.children
const previous = siblings[index - 1]
previous.children.push(...children)
previous.children = previous.children.concat(children)
parent.children.splice(index, 1)
siblings.splice(index, 1)
// Update position.
if (previous.position && child.position) {
previous.position.end = child.position.end
}
// Update position.
if (previous.position && child.position) {
previous.position.end = child.position.end
// Next, iterate over the node *now* at the current position.
return index
}
if (node.type === 'SymbolNode' || node.type === 'PunctuationNode') {
return
}
}
// Next, iterate over the node *now* at the current position.
return index
}
if (node.type === 'SymbolNode' || node.type === 'PunctuationNode') {
return
}
}
}
})
)

@@ -0,1 +1,5 @@

/**
* @typedef {import('nlcst').Sentence} Sentence
*/
import {toString} from 'nlcst-to-string'

@@ -5,39 +9,40 @@ import {modifyChildren} from 'unist-util-modify-children'

// Merge certain punctuation marks into their following words.
export const mergeInitialWordSymbol = modifyChildren(function (
child,
index,
parent
) {
if (
(child.type !== 'SymbolNode' && child.type !== 'PunctuationNode') ||
toString(child) !== '&'
) {
return
}
export const mergeInitialWordSymbol = modifyChildren(
/**
* @type {import('unist-util-modify-children').Modifier<Sentence>}
*/
function (child, index, parent) {
if (
(child.type !== 'SymbolNode' && child.type !== 'PunctuationNode') ||
toString(child) !== '&'
) {
return
}
const children = parent.children
const next = children[index + 1]
const children = parent.children
const next = children[index + 1]
// If either a previous word, or no following word, exists, exit early.
if (
(index > 0 && children[index - 1].type === 'WordNode') ||
!(next && next.type === 'WordNode')
) {
return
}
// If either a previous word, or no following word, exists, exit early.
if (
(index > 0 && children[index - 1].type === 'WordNode') ||
!(next && next.type === 'WordNode')
) {
return
}
// Remove `child` from parent.
children.splice(index, 1)
// Remove `child` from parent.
children.splice(index, 1)
// Add the punctuation mark at the start of the next node.
next.children.unshift(child)
// Add the punctuation mark at the start of the next node.
next.children.unshift(child)
// Update position.
if (next.position && child.position) {
next.position.start = child.position.start
// Update position.
if (next.position && child.position) {
next.position.start = child.position.start
}
// Next, iterate over the node at the previous position, as it's now adjacent
// to a following word.
return index - 1
}
// Next, iterate over the node at the previous position, as it's now adjacent
// to a following word.
return index - 1
})
)

@@ -0,1 +1,5 @@

/**
* @typedef {import('nlcst').Sentence} Sentence
*/
import {toString} from 'nlcst-to-string'

@@ -6,58 +10,64 @@ import {modifyChildren} from 'unist-util-modify-children'

// Merge initialisms.
export const mergeInitialisms = modifyChildren(function (child, index, parent) {
if (index > 0 && toString(child) === '.') {
const siblings = parent.children
const previous = siblings[index - 1]
const children = previous.children
export const mergeInitialisms = modifyChildren(
/**
* @type {import('unist-util-modify-children').Modifier<Sentence>}
*/
function (child, index, parent) {
if (
previous.type === 'WordNode' &&
children &&
children.length !== 1 &&
children.length % 2 !== 0
index > 0 &&
child.type === 'PunctuationNode' &&
toString(child) === '.'
) {
let position = children.length
let isAllDigits = true
const previous = parent.children[index - 1]
while (children[--position]) {
const otherChild = children[position]
if (
previous.type === 'WordNode' &&
previous.children &&
previous.children.length !== 1 &&
previous.children.length % 2 !== 0
) {
let position = previous.children.length
let isAllDigits = true
const value = toString(otherChild)
while (previous.children[--position]) {
const otherChild = previous.children[position]
if (position % 2 === 0) {
// Initialisms consist of one character values.
if (value.length > 1) {
return
}
const value = toString(otherChild)
if (!numerical.test(value)) {
isAllDigits = false
if (position % 2 === 0) {
// Initialisms consist of one character values.
if (value.length > 1) {
return
}
if (!numerical.test(value)) {
isAllDigits = false
}
} else if (value !== '.') {
if (position < previous.children.length - 2) {
break
} else {
return
}
}
} else if (value !== '.') {
if (position < children.length - 2) {
break
} else {
return
}
}
}
if (!isAllDigits) {
// Remove `child` from parent.
siblings.splice(index, 1)
if (!isAllDigits) {
// Remove `child` from parent.
parent.children.splice(index, 1)
// Add child to the previous children.
children.push(child)
// Add child to the previous children.
previous.children.push(child)
// Update position.
if (previous.position && child.position) {
previous.position.end = child.position.end
// Update position.
if (previous.position && child.position) {
previous.position.end = child.position.end
}
// Next, iterate over the node *now* at the current position.
return index
}
// Next, iterate over the node *now* at the current position.
return index
}
}
}
})
)

@@ -0,50 +1,57 @@

/**
* @typedef {import('nlcst').SentenceContent} SentenceContent
* @typedef {import('nlcst').WordContent} WordContent
* @typedef {import('nlcst').Sentence} Sentence
*/
import {toString} from 'nlcst-to-string'
import {modifyChildren} from 'unist-util-modify-children'
const slash = '/'
// Merge words joined by certain punctuation marks.
export const mergeInnerWordSlash = modifyChildren(function (
child,
index,
parent
) {
const siblings = parent.children
const previous = siblings[index - 1]
const next = siblings[index + 1]
export const mergeInnerWordSlash = modifyChildren(
/**
* @type {import('unist-util-modify-children').Modifier<Sentence>}
*/
function (child, index, parent) {
const siblings = parent.children
const previous = siblings[index - 1]
if (
previous &&
previous.type === 'WordNode' &&
(child.type === 'SymbolNode' || child.type === 'PunctuationNode') &&
toString(child) === slash
) {
const previousValue = toString(previous)
let tail = child
let queue = [child]
let count = 1
let nextValue = ''
if (
previous &&
previous.type === 'WordNode' &&
(child.type === 'SymbolNode' || child.type === 'PunctuationNode') &&
toString(child) === '/'
) {
const previousValue = toString(previous)
/** @type {SentenceContent} */
let tail = child
/** @type {Array<WordContent>} */
const queue = [child]
let count = 1
let nextValue = ''
const next = siblings[index + 1]
if (next && next.type === 'WordNode') {
nextValue = toString(next)
tail = next
queue = queue.concat(next.children)
count++
}
if (next && next.type === 'WordNode') {
nextValue = toString(next)
tail = next
queue.push(...next.children)
count++
}
if (previousValue.length < 3 && (!nextValue || nextValue.length < 3)) {
// Add all found tokens to `prev`s children.
previous.children = previous.children.concat(queue)
if (previousValue.length < 3 && (!nextValue || nextValue.length < 3)) {
// Add all found tokens to `prev`s children.
previous.children.push(...queue)
siblings.splice(index, count)
siblings.splice(index, count)
// Update position.
if (previous.position && tail.position) {
previous.position.end = tail.position.end
// Update position.
if (previous.position && tail.position) {
previous.position.end = tail.position.end
}
// Next, iterate over the node *now* at the current position.
return index
}
// Next, iterate over the node *now* at the current position.
return index
}
}
})
)

@@ -0,1 +1,6 @@

/**
* @typedef {import('nlcst').Sentence} Sentence
* @typedef {import('nlcst').WordContent} WordContent
*/
import {toString} from 'nlcst-to-string'

@@ -8,66 +13,69 @@ import {modifyChildren} from 'unist-util-modify-children'

// Merge words joined by certain punctuation marks.
export const mergeInnerWordSymbol = modifyChildren(function (
child,
index,
parent
) {
if (
index > 0 &&
(child.type === 'SymbolNode' || child.type === 'PunctuationNode')
) {
const siblings = parent.children
const previous = siblings[index - 1]
export const mergeInnerWordSymbol = modifyChildren(
/**
* @type {import('unist-util-modify-children').Modifier<Sentence>}
*/
function (child, index, parent) {
if (
index > 0 &&
(child.type === 'SymbolNode' || child.type === 'PunctuationNode')
) {
const siblings = parent.children
const previous = siblings[index - 1]
if (previous && previous.type === 'WordNode') {
let position = index - 1
let tokens = []
let queue = []
if (previous && previous.type === 'WordNode') {
let position = index - 1
/** @type {Array<WordContent>} */
const tokens = []
/** @type {Array<WordContent>} */
let queue = []
// - If a token which is neither word nor inner word symbol is found,
// the loop is broken
// - If an inner word symbol is found, it’s queued
// - If a word is found, it’s queued (and the queue stored and emptied)
while (siblings[++position]) {
const sibling = siblings[position]
// - If a token which is neither word nor inner word symbol is found,
// the loop is broken
// - If an inner word symbol is found, it’s queued
// - If a word is found, it’s queued (and the queue stored and emptied)
while (siblings[++position]) {
const sibling = siblings[position]
if (sibling.type === 'WordNode') {
tokens = tokens.concat(queue, sibling.children)
if (sibling.type === 'WordNode') {
tokens.push(...queue, ...sibling.children)
queue = []
} else if (
(sibling.type === 'SymbolNode' ||
sibling.type === 'PunctuationNode') &&
wordSymbolInner.test(toString(sibling))
) {
queue.push(sibling)
} else {
break
queue = []
} else if (
(sibling.type === 'SymbolNode' ||
sibling.type === 'PunctuationNode') &&
wordSymbolInner.test(toString(sibling))
) {
queue.push(sibling)
} else {
break
}
}
}
if (tokens.length > 0) {
// If there is a queue, remove its length from `position`.
if (queue.length > 0) {
position -= queue.length
}
if (tokens.length > 0) {
// If there is a queue, remove its length from `position`.
if (queue.length > 0) {
position -= queue.length
}
// Remove every (one or more) inner-word punctuation marks and children
// of words.
siblings.splice(index, position - index)
// Remove every (one or more) inner-word punctuation marks and children
// of words.
siblings.splice(index, position - index)
// Add all found tokens to `prev`s children.
previous.children = previous.children.concat(tokens)
// Add all found tokens to `prev`s children.
previous.children.push(...tokens)
const last = tokens[tokens.length - 1]
const last = tokens[tokens.length - 1]
// Update position.
if (previous.position && last.position) {
previous.position.end = last.position.end
// Update position.
if (previous.position && last.position) {
previous.position.end = last.position.end
}
// Next, iterate over the node *now* at the current position.
return index
}
// Next, iterate over the node *now* at the current position.
return index
}
}
}
})
)

@@ -0,1 +1,5 @@

/**
* @typedef {import('nlcst').Paragraph} Paragraph
*/
import {modifyChildren} from 'unist-util-modify-children'

@@ -5,47 +9,49 @@

// contain word tokens.
export const mergeNonWordSentences = modifyChildren(function (
child,
index,
parent
) {
const children = child.children
let position = -1
export const mergeNonWordSentences = modifyChildren(
/**
* @type {import('unist-util-modify-children').Modifier<Paragraph>}
*/
function (child, index, parent) {
if ('children' in child) {
let position = -1
while (children[++position]) {
if (children[position].type === 'WordNode') {
return
}
}
while (child.children[++position]) {
if (child.children[position].type === 'WordNode') {
return
}
}
const previous = parent.children[index - 1]
const previous = parent.children[index - 1]
if (previous) {
previous.children = previous.children.concat(children)
if (previous && 'children' in previous) {
previous.children.push(...child.children)
// Remove the child.
parent.children.splice(index, 1)
// Remove the child.
parent.children.splice(index, 1)
// Patch position.
if (previous.position && child.position) {
previous.position.end = child.position.end
}
// Patch position.
if (previous.position && child.position) {
previous.position.end = child.position.end
}
// Next, iterate over the node *now* at the current position (which was the
// next node).
return index
}
// Next, iterate over the node *now* at the current position (which was the
// next node).
return index
}
const next = parent.children[index + 1]
const next = parent.children[index + 1]
if (next) {
next.children = children.concat(next.children)
if (next && 'children' in next) {
next.children.unshift(...child.children)
// Patch position.
if (next.position && child.position) {
next.position.start = child.position.start
// Patch position.
if (next.position && child.position) {
next.position.start = child.position.start
}
// Remove the child.
parent.children.splice(index, 1)
}
}
// Remove the child.
parent.children.splice(index, 1)
}
})
)

@@ -0,1 +1,5 @@

/**
* @typedef {import('nlcst').Paragraph} Paragraph
*/
import {toString} from 'nlcst-to-string'

@@ -26,44 +30,46 @@ import {modifyChildren} from 'unist-util-modify-children'

// certain word.
export const mergePrefixExceptions = modifyChildren(function (
child,
index,
parent
) {
const children = child.children
export const mergePrefixExceptions = modifyChildren(
/**
* @type {import('unist-util-modify-children').Modifier<Paragraph>}
*/
function (child, index, parent) {
if ('children' in child && child.children.length > 1) {
const period = child.children[child.children.length - 1]
if (children && children.length > 1) {
const period = children[children.length - 1]
if (period && toString(period) === '.') {
const node = children[children.length - 2]
if (
node &&
node.type === 'WordNode' &&
abbreviationPrefix.test(toString(node).toLowerCase())
period &&
(period.type === 'PunctuationNode' || period.type === 'SymbolNode') &&
toString(period) === '.'
) {
// Merge period into abbreviation.
node.children.push(period)
children.pop()
const node = child.children[child.children.length - 2]
// Update position.
if (period.position && node.position) {
node.position.end = period.position.end
}
if (
node &&
node.type === 'WordNode' &&
abbreviationPrefix.test(toString(node).toLowerCase())
) {
// Merge period into abbreviation.
node.children.push(period)
child.children.pop()
// Merge sentences.
const next = parent.children[index + 1]
// Update position.
if (period.position && node.position) {
node.position.end = period.position.end
}
if (next) {
child.children = children.concat(next.children)
// Merge sentences.
const next = parent.children[index + 1]
parent.children.splice(index + 1, 1)
if (next && next.type === 'SentenceNode') {
child.children.push(...next.children)
parent.children.splice(index + 1, 1)
// Update position.
if (next.position && child.position) {
child.position.end = next.position.end
// Update position.
if (next.position && child.position) {
child.position.end = next.position.end
}
// Next, iterate over the current node again.
return index - 1
}
// Next, iterate over the current node again.
return index - 1
}

@@ -73,2 +79,2 @@ }

}
})
)

@@ -0,1 +1,5 @@

/**
* @typedef {import('nlcst').Paragraph} Paragraph
*/
import {toString} from 'nlcst-to-string'

@@ -10,82 +14,88 @@ import {visitChildren} from 'unist-util-visit-children'

// or the next word (if available).
export const mergeRemainingFullStops = visitChildren(function (child) {
const children = child.children
let position = children.length
let hasFoundDelimiter = false
export const mergeRemainingFullStops = visitChildren(
/**
* @type {import('unist-util-visit-children').Visitor<Paragraph>}
*/
// eslint-disable-next-line complexity
function (child, _, _parent) {
if ('children' in child) {
let position = child.children.length
let hasFoundDelimiter = false
while (children[--position]) {
const grandchild = children[position]
while (child.children[--position]) {
const grandchild = child.children[position]
if (
grandchild.type !== 'SymbolNode' &&
grandchild.type !== 'PunctuationNode'
) {
// This is a sentence without terminal marker, so we 'fool' the code to
// make it think we have found one.
if (grandchild.type === 'WordNode') {
hasFoundDelimiter = true
}
if (
grandchild.type !== 'SymbolNode' &&
grandchild.type !== 'PunctuationNode'
) {
// This is a sentence without terminal marker, so we 'fool' the code to
// make it think we have found one.
if (grandchild.type === 'WordNode') {
hasFoundDelimiter = true
}
continue
}
continue
}
// Exit when this token is not a terminal marker.
if (!terminalMarker.test(toString(grandchild))) {
continue
}
// Exit when this token is not a terminal marker.
if (!terminalMarker.test(toString(grandchild))) {
continue
}
// Ignore the first terminal marker found (starting at the end), as it
// should not be merged.
if (!hasFoundDelimiter) {
hasFoundDelimiter = true
// Ignore the first terminal marker found (starting at the end), as it
// should not be merged.
if (!hasFoundDelimiter) {
hasFoundDelimiter = true
continue
}
continue
}
// Only merge a single full stop.
if (toString(grandchild) !== '.') {
continue
}
// Only merge a single full stop.
if (toString(grandchild) !== '.') {
continue
}
const previous = child.children[position - 1]
const next = child.children[position + 1]
const previous = children[position - 1]
const next = children[position + 1]
if (previous && previous.type === 'WordNode') {
const nextNext = child.children[position + 2]
if (previous && previous.type === 'WordNode') {
const nextNext = children[position + 2]
// Continue when the full stop is followed by a space and another full
// stop, such as: `{.} .`
if (
next &&
nextNext &&
next.type === 'WhiteSpaceNode' &&
toString(nextNext) === '.'
) {
continue
}
// Continue when the full stop is followed by a space and another full
// stop, such as: `{.} .`
if (
next &&
nextNext &&
next.type === 'WhiteSpaceNode' &&
toString(nextNext) === '.'
) {
continue
}
// Remove `child` from parent.
child.children.splice(position, 1)
// Remove `child` from parent.
children.splice(position, 1)
// Add the punctuation mark at the end of the previous node.
previous.children.push(grandchild)
// Add the punctuation mark at the end of the previous node.
previous.children.push(grandchild)
// Update position.
if (grandchild.position && previous.position) {
previous.position.end = grandchild.position.end
}
// Update position.
if (grandchild.position && previous.position) {
previous.position.end = grandchild.position.end
}
position--
} else if (next && next.type === 'WordNode') {
// Remove `child` from parent.
child.children.splice(position, 1)
position--
} else if (next && next.type === 'WordNode') {
// Remove `child` from parent.
children.splice(position, 1)
// Add the punctuation mark at the start of the next node.
next.children.unshift(grandchild)
// Add the punctuation mark at the start of the next node.
next.children.unshift(grandchild)
if (grandchild.position && next.position) {
next.position.start = grandchild.position.start
if (grandchild.position && next.position) {
next.position.start = grandchild.position.start
}
}
}
}
}
})
)

@@ -0,31 +1,47 @@

/**
* @typedef {import('unist').Node} Node
* @typedef {import('nlcst').Sentence} Sentence
* @typedef {import('nlcst').Paragraph} Paragraph
* @typedef {import('nlcst').Root} Root
*/
import {visitChildren} from 'unist-util-visit-children'
// Patch the position on a parent node based on its first and last child.
export const patchPosition = visitChildren(function (child, index, node) {
const siblings = node.children
export const patchPosition = visitChildren(
/**
* @type {import('unist-util-visit-children').Visitor<Paragraph|Sentence|Root>}
*/
function (child, index, node) {
const siblings = node.children
if (!child.position) {
return
}
if (
index < 1 &&
/* c8 ignore next */
(!node.position || !node.position.start)
) {
patch(node)
// @ts-expect-error: we just set it.
node.position.start = child.position.start
}
if (
index < 1 &&
/* c8 ignore next */
(!node.position || !node.position.start)
) {
patch(node)
node.position.start = child.position.start
if (
index === siblings.length - 1 &&
(!node.position || !node.position.end)
) {
patch(node)
// @ts-expect-error: we just set it.
node.position.end = child.position.end
}
}
)
if (index === siblings.length - 1 && (!node.position || !node.position.end)) {
patch(node)
node.position.end = child.position.end
}
})
// Add a `position` object when it does not yet exist on `node`.
/**
* @param {Node} node
*/
function patch(node) {
if (!node.position) {
// @ts-expect-error: fine.
node.position = {}
}
}

@@ -0,12 +1,23 @@

/**
* @typedef {import('nlcst').Root} Root
* @typedef {import('nlcst').Paragraph} Paragraph
*/
import {modifyChildren} from 'unist-util-modify-children'
// Remove empty children.
export const removeEmptyNodes = modifyChildren(function (child, index, parent) {
if ('children' in child && child.children.length === 0) {
parent.children.splice(index, 1)
export const removeEmptyNodes = modifyChildren(
/**
* @type {import('unist-util-modify-children').Modifier<Root | Paragraph>}
*/
// Next, iterate over the node *now* at the current position (which was the
// next node).
return index
function (child, index, parent) {
if ('children' in child && child.children.length === 0) {
parent.children.splice(index, 1)
// Next, iterate over the node *now* at the current position (which was the
// next node).
return index
}
}
})
)
{
"name": "parse-latin",
"version": "5.0.1",
"version": "6.0.0",
"description": "Latin-script (natural language) parser",

@@ -27,16 +27,22 @@ "license": "MIT",

"main": "index.js",
"types": "index.d.ts",
"files": [
"lib/",
"index.d.ts",
"index.js"
],
"dependencies": {
"@types/nlcst": "^1.0.0",
"@types/unist": "^2.0.0",
"nlcst-to-string": "^3.0.0",
"unist-util-modify-children": "^3.0.0",
"unist-util-visit-children": "^2.0.0"
"unist-util-visit-children": "^2.0.0",
"vfile": "^5.0.0"
},
"devDependencies": {
"@types/node": "^18.0.0",
"@types/regenerate": "^1.0.0",
"@unicode/unicode-13.0.0": "^1.0.0",
"c8": "^7.0.0",
"is-hidden": "^2.0.0",
"negate": "^1.0.0",
"nlcst-test": "^3.0.0",

@@ -48,9 +54,11 @@ "nyc": "^15.0.0",

"remark-preset-wooorm": "^9.0.0",
"type-coverage": "^2.0.0",
"typescript": "^4.0.0",
"unist-util-remove-position": "^4.0.0",
"vfile": "^5.0.0",
"xo": "^0.52.0"
},
"scripts": {
"prepack": "npm run generate && npm run format",
"prepack": "npm run generate && npm run build && npm run format",
"fixture": "node script/generate-fixture.js",
"build": "tsc --build --clean && tsc --build && type-coverage",
"generate": "node script/build-expressions.js",

@@ -60,3 +68,3 @@ "format": "remark . -qfo && prettier . -w --loglevel warn && xo --fix",

"test-coverage": "c8 --check-coverage --100 --reporter lcov npm run test-api",
"test": "npm run generate && npm run format && npm run test-coverage"
"test": "npm run generate && npm run build && npm run format && npm run test-coverage"
},

@@ -82,3 +90,8 @@ "prettier": {

]
},
"typeCoverage": {
"atLeast": 100,
"detail": true,
"strict": true
}
}

@@ -7,22 +7,50 @@ # parse-latin

[![Size][size-badge]][size]
[![Chat][chat-badge]][chat]
A Latin-script language parser for [**retext**][retext] producing **[nlcst][]**
nodes.
A natural language parser, for Latin-script languages, that produces [nlcst][].
## Contents
* [What is this?](#what-is-this)
* [When should I use this?](#when-should-i-use-this)
* [Install](#install)
* [Use](#use)
* [API](#api)
* [`ParseLatin()`](#parselatin)
* [Algorithm](#algorithm)
* [Types](#types)
* [Compatibility](#compatibility)
* [Related](#related)
* [Contribute](#contribute)
* [Security](#security)
* [License](#license)
## What is this?
This package exposes a parser that takes Latin-script natural language and
produces a syntax tree.
## When should I use this?
If you want to handle natural language as syntax trees manually, use this.
Alternatively, you can use the retext plugin [`retext-latin`][retext-latin],
which wraps this project to also parse natural language at a higher-level
(easier) abstraction.
Whether Old-English (“þā gewearþ þǣm hlāforde and þǣm hȳrigmannum wiþ ānum
penninge”), Icelandic (“Hvað er að frétta”), or French (“Où sont les toilettes?”),
`parse-latin` does a good job at tokenizing it.
this project does a good job at tokenizing it.
Note also that `parse-latin` does a decent job at tokenizing Latin-like scripts,
Cyrillic (“Добро пожаловать!”), Georgian (“როგორა ხარ?”), Armenian (“Շատ հաճելի
է”), and such.
For English and Dutch, you can instead use [`parse-english`][parse-english] and
[`parse-dutch`][parse-dutch].
You can somewhat use this for Latin-like scripts, such as Cyrillic
(“Добро пожаловать!”), Georgian (“როგორა ხარ?”), Armenian (“Շատ հաճելի է”),
and such.
## Install
This package is ESM only: Node 12+ is needed to use it and it must be `import`ed
instead of `require`d.
This package is [ESM only][esm].
In Node.js (version 14.14+, 16.0+), install with [npm][]:
[npm][]:
```sh

@@ -32,2 +60,16 @@ npm install parse-latin

In Deno with [`esm.sh`][esmsh]:
```js
import {ParseLatin} from 'https://esm.sh/parse-latin@6'
```
In browsers with [`esm.sh`][esmsh]:
```html
<script type="module">
import {ParseLatin} from 'https://esm.sh/parse-latin@6?bundle'
</script>
```
## Use

@@ -44,3 +86,3 @@

Which, when inspecting, yields:
Yields:

@@ -64,55 +106,76 @@ ```txt

This package exports the following identifiers: `ParseLatin`.
This package exports the identifier `ParseLatin`.
There is no default export.
### `ParseLatin(value)`
### `ParseLatin()`
Exposes the functionality needed to tokenize natural Latin-script languages into
a syntax tree.
If `value` is passed here, it’s not needed to give it to `#parse()`.
Create a new parser.
#### `ParseLatin#tokenize(value)`
#### `ParseLatin#parse(value)`
Tokenize `value` (`string`) into letters and numbers (words), white space, and
everything else (punctuation).
The returned nodes are a flat list without paragraphs or sentences.
Turn natural language into a syntax tree.
###### Returns
##### Parameters
[`Array.<Node>`][nlcst] — Nodes.
###### `value`
#### `ParseLatin#parse(value)`
Value to parse (`string`).
Tokenize `value` (`string`) into an [NLCST][] tree.
The returned node is a `RootNode` containing paragraphs and sentences.
##### Returns
###### Returns
[`RootNode`][root].
[`Node`][nlcst] — Root node.
## Algorithm
> Note: The easiest way to see **how parse-latin tokenizes and parses**, is by
> using the [online parser demo][demo], which
> shows the syntax tree corresponding to the typed text.
> 👉 **Note**:
> The easiest way to see how `parse-latin` parses is by using the
> [online parser demo][demo], which shows the syntax tree corresponding to
> the typed text.
`parse-latin` splits text into white space, word, and punctuation tokens.
`parse-latin` starts out with a pretty easy definition, one that most other
tokenizers use:
`parse-latin` splits text into white space, punctuation, symbol, and word
tokens:
* A “word” is one or more letter or number characters
* A “white space” is one or more white space characters
* A “punctuation” is one or more of anything else
* “word” is one or more unicode letters or numbers
* “white space” is one or more unicode white space characters
* “punctuation” is one or more unicode punctuation characters
* “symbol” is one or more of anything else
Then, it manipulates and merges those tokens into a ([nlcst][]) syntax tree,
adding sentences and paragraphs where needed.
Then, it manipulates and merges those tokens into a syntax tree, adding
sentences and paragraphs where needed.
* Some punctuation marks are part of the word they occur in, such as
* some punctuation marks are part of the word they occur in, such as
`non-profit`, `she’s`, `G.I.`, `11:00`, `N/A`, `&c`, `nineteenth- and…`
* Some full-stops do not mark a sentence end, such as `1.`, `e.g.`, `id.`
* Although full-stops, question marks, and exclamation marks (sometimes) end a
* some periods do not mark a sentence end, such as `1.`, `e.g.`, `id.`
* although periods, question marks, and exclamation marks (sometimes) end a
sentence, that end might not occur directly after the mark, such as `.)`,
`."`
* And many more exceptions
* …and many more exceptions
## Types
This package is fully typed with [TypeScript][].
It exports no additional types.
## Compatibility
This package is at least compatible with all maintained versions of Node.js.
As of now, that is Node.js 14.14+ and 16.0+.
It also works in Deno and modern browsers.
## Related
* [`parse-english`](https://github.com/wooorm/parse-english)
— English (natural language) parser
* [`parse-dutch`](https://github.com/wooorm/parse-dutch)
— Dutch (natural language) parser
## Contribute
Yes please!
See [How to Contribute to Open Source][contribute].
## Security
This package is safe.
## License

@@ -140,6 +203,2 @@

[chat-badge]: https://img.shields.io/badge/join%20the%20community-on%20spectrum-7b16ff.svg
[chat]: https://spectrum.chat/unified/retext
[npm]: https://docs.npmjs.com/cli/install

@@ -149,2 +208,10 @@

[esm]: https://gist.github.com/sindresorhus/a39789f98801d908bbc7ff3ecc99d99c
[esmsh]: https://esm.sh
[typescript]: https://www.typescriptlang.org
[contribute]: https://opensource.guide/how-to-contribute/
[license]: license

@@ -154,4 +221,10 @@

[retext]: https://github.com/retextjs/retext
[nlcst]: https://github.com/syntax-tree/nlcst
[nlcst]: https://github.com/syntax-tree/nlcst
[root]: https://github.com/syntax-tree/nlcst#root
[retext-latin]: https://github.com/retextjs/retext/tree/main/packages/retext-latin
[parse-english]: https://github.com/wooorm/parse-english
[parse-dutch]: https://github.com/wooorm/parse-dutch
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc