@diplodoc/sentenizer
Advanced tools
Comparing version 0.0.6 to 0.0.7
@@ -1,1 +0,1 @@ | ||
export { sentenize } from './sentenize'; | ||
export declare function sentenize(text: string): string[]; |
358
lib/index.js
@@ -25,4 +25,2 @@ var __defProp = Object.defineProperty; | ||
module.exports = __toCommonJS(src_exports); | ||
// src/sentenize.ts | ||
var import_ramda8 = require("ramda"); | ||
@@ -33,167 +31,11 @@ | ||
// src/constants.ts | ||
// src/constants/markers.ts | ||
var SENTENCE_END_MARKERS = ".?!\u2026"; | ||
var QUOTATION_GENERIC_MARKERS = `\xAB"\u201E'`; | ||
var QUOTATION_CLOSE_MARKERS = '\xBB"\u201D\u2019'; | ||
var QUOTATION_GENERIC_MARKERS = `"\u201E'`; | ||
var QUOTATION_CLOSE_MARKERS = "\xBB\u201D\u2019"; | ||
var BRACKETS_CLOSE_MARKERS = "\\)\\]\\}>"; | ||
// src/constants/parameters.ts | ||
var WINDOW_WIDTH = 10; | ||
// src/lenses/index.ts | ||
var import_ramda = require("ramda"); | ||
var first = () => (0, import_ramda.lensIndex)(0); | ||
var second = () => (0, import_ramda.lensIndex)(1); | ||
var last = () => (0, import_ramda.lensIndex)(-1); | ||
// src/parsers/index.ts | ||
var firstString = first(); | ||
var fst = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), (0, import_ramda2.view)(firstString)); | ||
var secondString = second(); | ||
var snd = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), (0, import_ramda2.view)(secondString)); | ||
var lastString = last(); | ||
var lst = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), (0, import_ramda2.view)(lastString)); | ||
var sentencePattern = `([^${SENTENCE_END_MARKERS}]*?[${SENTENCE_END_MARKERS}]+)`; | ||
var senteceFlags = "gmu"; | ||
var sentenceRegExp = new RegExp(sentencePattern, senteceFlags); | ||
var sentences = (0, import_ramda2.compose)((0, import_ramda2.filter)(Boolean), (0, import_ramda2.split)(sentenceRegExp)); | ||
var sentenceDelimitersPattern = `([${SENTENCE_END_MARKERS}]+)$`; | ||
var sentenceDelimitersFlags = "gmu"; | ||
var sentenceDelimitersRegExp = new RegExp(sentenceDelimitersPattern, sentenceDelimitersFlags); | ||
var words = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), (0, import_ramda2.replace)(sentenceDelimitersRegExp)("")); | ||
var delimiters = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(sentenceDelimitersRegExp)); | ||
var fstTokenPattern = /^\s*([^\s]+?)(?=\s|$)/; | ||
var fstTokenFlags = "mu"; | ||
var fstTokenRegExp = new RegExp(fstTokenPattern, fstTokenFlags); | ||
var fstToken = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(fstTokenRegExp)); | ||
var fstWord = (0, import_ramda2.compose)(fstToken, words); | ||
var lstTokenPattern = /([^\s]+)\s*$/; | ||
var lstTokenFlags = "mu"; | ||
var lstTokenRegExp = new RegExp(lstTokenPattern, lstTokenFlags); | ||
var lstToken = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(lstTokenRegExp)); | ||
var nonAlphaStartPattern = /^[^\wа-яА-Я]*/; | ||
var nonAlphaStartFlags = "gmu"; | ||
var nonAlphaStartRegExp = new RegExp(nonAlphaStartPattern, nonAlphaStartFlags); | ||
var omitNonAlphaStart = (0, import_ramda2.replace)(nonAlphaStartRegExp, ""); | ||
var lstWord = (0, import_ramda2.compose)(lstToken, words); | ||
var fstChars = (width = WINDOW_WIDTH) => { | ||
const fstCharsPattern = `^[\\s\\S]{0,${width}}`; | ||
const fstCharsFlags = "gmu"; | ||
const fstCharsRegExp = new RegExp(fstCharsPattern, fstCharsFlags); | ||
return (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(fstCharsRegExp)); | ||
}; | ||
var lstChars = (width = WINDOW_WIDTH) => { | ||
const lstCharsPattern = `.{0,${width}}$`; | ||
const lstCharsFlags = "gmu"; | ||
const lstCharsRegExp = new RegExp(lstCharsPattern, lstCharsFlags); | ||
return (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(lstCharsRegExp)); | ||
}; | ||
var spacePrefixPattern = /^\s/; | ||
var spacePrefixFlags = "gmu"; | ||
var spacePrefixRegExp = new RegExp(spacePrefixPattern, spacePrefixFlags); | ||
var spacePrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(spacePrefixRegExp)); | ||
var spaceSuffixPattern = /\s$/; | ||
var spaceSuffixFlags = "mu"; | ||
var spaceSuffixRegExp = new RegExp(spaceSuffixPattern, spaceSuffixFlags); | ||
var spaceSuffix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(spaceSuffixRegExp)); | ||
var quotationGenericPrefixPattern = `^([${QUOTATION_GENERIC_MARKERS}]+)`; | ||
var quotationGenericPrefixFlags = "mu"; | ||
var quotationGenericPrefixRegExp = new RegExp( | ||
quotationGenericPrefixPattern, | ||
quotationGenericPrefixFlags | ||
); | ||
var quotationGenericPrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(quotationGenericPrefixRegExp)); | ||
var quotationClosePrefixPattern = `^([${QUOTATION_CLOSE_MARKERS}]+)`; | ||
var quotationClosePrefixFlags = "mu"; | ||
var quotationClosePrefixRegExp = new RegExp( | ||
quotationClosePrefixPattern, | ||
quotationClosePrefixFlags | ||
); | ||
var quotationClosePrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(quotationClosePrefixRegExp)); | ||
var delimiterPrefixPattern = `^([${SENTENCE_END_MARKERS}]+)`; | ||
var delimiterPrefixFlags = "mu"; | ||
var delimiterPrefixRegExp = new RegExp(delimiterPrefixPattern, delimiterPrefixFlags); | ||
var delimiterPrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(delimiterPrefixRegExp)); | ||
var bracketsClosePrefixPattern = `^([${BRACKETS_CLOSE_MARKERS}]+)`; | ||
var bracketsClosePrefixFlags = "mu"; | ||
var bracketsClosePrefixRegExp = new RegExp(bracketsClosePrefixPattern, bracketsClosePrefixFlags); | ||
var bracketsClosePrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(bracketsClosePrefixRegExp)); | ||
var spacesPattern = /^(\s+)$/; | ||
var spacesFlags = "gmu"; | ||
var spacesRegExp = new RegExp(spacesPattern, spacesFlags); | ||
var spaces = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(spacesRegExp)); | ||
var dotSuffixPattern = /[^.](\.)$/; | ||
var dotSuffixFlags = "mu"; | ||
var dotSuffixRegExp = new RegExp(dotSuffixPattern, dotSuffixFlags); | ||
var dotSuffix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(dotSuffixRegExp)); | ||
// src/rules/base.ts | ||
var import_ramda5 = require("ramda"); | ||
// src/utilities/list.ts | ||
var import_ramda3 = require("ramda"); | ||
var lenLte = (len) => (0, import_ramda3.compose)((0, import_ramda3.curry)((0, import_ramda3.flip)(import_ramda3.lte))(len), import_ramda3.length); | ||
var allEqual = (0, import_ramda3.compose)(lenLte(1), import_ramda3.uniq); | ||
var lengthNonZero = (0, import_ramda3.compose)(Boolean, import_ramda3.length); | ||
// src/utilities/string.ts | ||
var import_ramda4 = require("ramda"); | ||
var charAt = (0, import_ramda4.invoker)(1, "charAt"); | ||
var notAlpha = (0, import_ramda4.compose)(allEqual, (0, import_ramda4.juxt)([import_ramda4.toLower, import_ramda4.toUpper])); | ||
var hasAlpha = (0, import_ramda4.compose)(import_ramda4.not, notAlpha); | ||
var startsWithLower = (0, import_ramda4.allPass)([ | ||
(0, import_ramda4.compose)((0, import_ramda4.compose)(import_ramda4.not, (0, import_ramda4.match)(/\n/)), charAt(0)), | ||
(0, import_ramda4.compose)((0, import_ramda4.compose)(import_ramda4.not, notAlpha), charAt(0)), | ||
(0, import_ramda4.compose)(allEqual, (0, import_ramda4.juxt)([import_ramda4.identity, import_ramda4.toLower]), charAt(0)) | ||
]); | ||
var isUpper = (0, import_ramda4.compose)(allEqual, (0, import_ramda4.juxt)([import_ramda4.toUpper, import_ramda4.identity])); | ||
// src/rules/base.ts | ||
var isSpaceSuffix = (0, import_ramda5.compose)(lengthNonZero, spaceSuffix); | ||
var isSpacePrefix = (0, import_ramda5.compose)(lengthNonZero, spacePrefix); | ||
var spaceBothSides = (0, import_ramda5.compose)( | ||
(0, import_ramda5.all)(Boolean), | ||
(0, import_ramda5.zipWith)(import_ramda5.call, [isSpaceSuffix, isSpacePrefix]), | ||
(0, import_ramda5.map)(words) | ||
); | ||
var rightLacksSpacePrefix = (0, import_ramda5.compose)( | ||
(0, import_ramda5.all)(Boolean), | ||
(0, import_ramda5.zipWith)(import_ramda5.call, [(0, import_ramda5.always)(true), (0, import_ramda5.compose)(import_ramda5.not, isSpacePrefix)]), | ||
(0, import_ramda5.map)(words) | ||
); | ||
var rightStartsWithLowercase = (0, import_ramda5.compose)( | ||
(0, import_ramda5.all)(Boolean), | ||
(0, import_ramda5.zipWith)(import_ramda5.call, [(0, import_ramda5.always)(true), (0, import_ramda5.compose)(startsWithLower, fstToken)]) | ||
); | ||
var rightDelimiterPrefix = (0, import_ramda5.compose)( | ||
(0, import_ramda5.all)(Boolean), | ||
(0, import_ramda5.zipWith)(import_ramda5.call, [(0, import_ramda5.always)(true), (0, import_ramda5.compose)(lengthNonZero, delimiterPrefix, fstToken)]) | ||
); | ||
var rightQuotationGenericPrefix = (0, import_ramda5.compose)( | ||
(0, import_ramda5.all)(Boolean), | ||
(0, import_ramda5.zipWith)(import_ramda5.call, [(0, import_ramda5.always)(true), (0, import_ramda5.compose)(lengthNonZero, quotationGenericPrefix)]) | ||
); | ||
var rightQuotationClosePrefix = (0, import_ramda5.compose)( | ||
(0, import_ramda5.all)(Boolean), | ||
(0, import_ramda5.zipWith)(import_ramda5.call, [(0, import_ramda5.always)(true), (0, import_ramda5.compose)(lengthNonZero, quotationClosePrefix, fstToken)]) | ||
); | ||
var rightBracketsClosePrefix = (0, import_ramda5.compose)( | ||
(0, import_ramda5.all)(Boolean), | ||
(0, import_ramda5.zipWith)(import_ramda5.call, [(0, import_ramda5.always)(true), (0, import_ramda5.compose)(lengthNonZero, bracketsClosePrefix, fstToken)]) | ||
); | ||
var rightOnlySpaces = (0, import_ramda5.compose)( | ||
(0, import_ramda5.all)(Boolean), | ||
(0, import_ramda5.zipWith)(import_ramda5.call, [(0, import_ramda5.always)(true), (0, import_ramda5.compose)(lengthNonZero, spaces)]) | ||
); | ||
// src/rules/initials.ts | ||
var import_ramda6 = require("ramda"); | ||
var isLeftDotDelimiter = (0, import_ramda6.compose)(lengthNonZero, dotSuffix); | ||
var isLeftSingleLetter = (0, import_ramda6.compose)((0, import_ramda6.equals)(1), import_ramda6.length, lstWord); | ||
var isLeftUpper = (0, import_ramda6.compose)(allEqual, (0, import_ramda6.juxt)([import_ramda6.toUpper, import_ramda6.identity]), lstWord); | ||
var leftHasAlpha = (0, import_ramda6.compose)(hasAlpha, lstWord); | ||
var isLeftInitials = (0, import_ramda6.allPass)([isLeftDotDelimiter, isLeftSingleLetter, isLeftUpper, leftHasAlpha]); | ||
var leftInitials = (0, import_ramda6.compose)((0, import_ramda6.all)(Boolean), (0, import_ramda6.zipWith)(import_ramda6.call, [isLeftInitials, (0, import_ramda6.always)(true)])); | ||
// src/rules/abbreviations.ts | ||
var import_ramda7 = require("ramda"); | ||
// src/constants/abbreviations.ts | ||
@@ -444,3 +286,153 @@ var INITIALS = { | ||
// src/lenses/index.ts | ||
var import_ramda = require("ramda"); | ||
var first = () => (0, import_ramda.lensIndex)(0); | ||
var second = () => (0, import_ramda.lensIndex)(1); | ||
var last = () => (0, import_ramda.lensIndex)(-1); | ||
// src/parsers/index.ts | ||
var firstString = first(); | ||
var fst = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), (0, import_ramda2.view)(firstString)); | ||
var secondString = second(); | ||
var snd = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), (0, import_ramda2.view)(secondString)); | ||
var lastString = last(); | ||
var lst = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), (0, import_ramda2.view)(lastString)); | ||
var sentencePattern = `([^${SENTENCE_END_MARKERS}]*?[${SENTENCE_END_MARKERS}]+)`; | ||
var senteceFlags = "gmu"; | ||
var sentenceRegExp = new RegExp(sentencePattern, senteceFlags); | ||
var sentences = (0, import_ramda2.compose)((0, import_ramda2.filter)(Boolean), (0, import_ramda2.split)(sentenceRegExp)); | ||
var sentenceDelimitersPattern = `([${SENTENCE_END_MARKERS}]+)$`; | ||
var sentenceDelimitersFlags = "gmu"; | ||
var sentenceDelimitersRegExp = new RegExp(sentenceDelimitersPattern, sentenceDelimitersFlags); | ||
var words = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), (0, import_ramda2.replace)(sentenceDelimitersRegExp)("")); | ||
var delimiters = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(sentenceDelimitersRegExp)); | ||
var fstTokenPattern = /^\s*([^\s]+?)(?=\s|$)/; | ||
var fstTokenFlags = "mu"; | ||
var fstTokenRegExp = new RegExp(fstTokenPattern, fstTokenFlags); | ||
var fstToken = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(fstTokenRegExp)); | ||
var fstWord = (0, import_ramda2.compose)(fstToken, words); | ||
var lstTokenPattern = /([^\s]+)\s*$/; | ||
var lstTokenFlags = "mu"; | ||
var lstTokenRegExp = new RegExp(lstTokenPattern, lstTokenFlags); | ||
var lstToken = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(lstTokenRegExp)); | ||
var nonAlphaStartPattern = /^[^\wа-яА-Я]*/; | ||
var nonAlphaStartFlags = "gmu"; | ||
var nonAlphaStartRegExp = new RegExp(nonAlphaStartPattern, nonAlphaStartFlags); | ||
var omitNonAlphaStart = (0, import_ramda2.replace)(nonAlphaStartRegExp, ""); | ||
var lstWord = (0, import_ramda2.compose)(lstToken, words); | ||
var fstChars = (width = WINDOW_WIDTH) => { | ||
const fstCharsPattern = `^[\\s\\S]{0,${width}}`; | ||
const fstCharsFlags = "gmu"; | ||
const fstCharsRegExp = new RegExp(fstCharsPattern, fstCharsFlags); | ||
return (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(fstCharsRegExp)); | ||
}; | ||
var lstChars = (width = WINDOW_WIDTH) => { | ||
const lstCharsPattern = `.{0,${width}}$`; | ||
const lstCharsFlags = "gmu"; | ||
const lstCharsRegExp = new RegExp(lstCharsPattern, lstCharsFlags); | ||
return (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(lstCharsRegExp)); | ||
}; | ||
var spacePrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(/^\s/)); | ||
var spaceSuffix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(/\s$/)); | ||
var quotationGenericPrefixPattern = `^([${QUOTATION_GENERIC_MARKERS}]+)`; | ||
var quotationGenericPrefixFlags = "mu"; | ||
var quotationGenericPrefixRegExp = new RegExp( | ||
quotationGenericPrefixPattern, | ||
quotationGenericPrefixFlags | ||
); | ||
var quotationGenericPrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(quotationGenericPrefixRegExp)); | ||
var quotationClosePrefixPattern = `^([${QUOTATION_CLOSE_MARKERS}]+)`; | ||
var quotationClosePrefixFlags = "mu"; | ||
var quotationClosePrefixRegExp = new RegExp( | ||
quotationClosePrefixPattern, | ||
quotationClosePrefixFlags | ||
); | ||
var quotationClosePrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(quotationClosePrefixRegExp)); | ||
var delimiterPrefixPattern = `^([${SENTENCE_END_MARKERS}]+)`; | ||
var delimiterPrefixFlags = "mu"; | ||
var delimiterPrefixRegExp = new RegExp(delimiterPrefixPattern, delimiterPrefixFlags); | ||
var delimiterPrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(delimiterPrefixRegExp)); | ||
var bracketsClosePrefixPattern = `^([${BRACKETS_CLOSE_MARKERS}]+)`; | ||
var bracketsClosePrefixFlags = "mu"; | ||
var bracketsClosePrefixRegExp = new RegExp(bracketsClosePrefixPattern, bracketsClosePrefixFlags); | ||
var bracketsClosePrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(bracketsClosePrefixRegExp)); | ||
var spacesPattern = /^(\s+)$/; | ||
var spacesFlags = "gmu"; | ||
var spacesRegExp = new RegExp(spacesPattern, spacesFlags); | ||
var spaces = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(spacesRegExp)); | ||
var dotSuffixPattern = /[^.](\.)$/; | ||
var dotSuffixFlags = "mu"; | ||
var dotSuffixRegExp = new RegExp(dotSuffixPattern, dotSuffixFlags); | ||
var dotSuffix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(dotSuffixRegExp)); | ||
// src/rules/base.ts | ||
var import_ramda5 = require("ramda"); | ||
// src/utilities/list.ts | ||
var import_ramda3 = require("ramda"); | ||
var lenLte = (len) => (0, import_ramda3.compose)((0, import_ramda3.curry)((0, import_ramda3.flip)(import_ramda3.lte))(len), import_ramda3.length); | ||
var allEqual = (0, import_ramda3.compose)(lenLte(1), import_ramda3.uniq); | ||
var lengthNonZero = (0, import_ramda3.compose)(Boolean, import_ramda3.length); | ||
// src/utilities/string.ts | ||
var import_ramda4 = require("ramda"); | ||
var charAt = (0, import_ramda4.invoker)(1, "charAt"); | ||
var notAlpha = (0, import_ramda4.compose)(allEqual, (0, import_ramda4.juxt)([import_ramda4.toLower, import_ramda4.toUpper])); | ||
var hasAlpha = (0, import_ramda4.compose)(import_ramda4.not, notAlpha); | ||
var startsWithLower = (0, import_ramda4.allPass)([ | ||
(0, import_ramda4.compose)(hasAlpha, charAt(0)), | ||
(0, import_ramda4.compose)(allEqual, (0, import_ramda4.juxt)([import_ramda4.identity, import_ramda4.toLower]), charAt(0)) | ||
]); | ||
var startsWithUpper = (0, import_ramda4.allPass)([ | ||
(0, import_ramda4.compose)(hasAlpha, charAt(0)), | ||
(0, import_ramda4.compose)(allEqual, (0, import_ramda4.juxt)([import_ramda4.identity, import_ramda4.toUpper]), charAt(0)) | ||
]); | ||
var startsWithNewline = (0, import_ramda4.compose)(lengthNonZero, (0, import_ramda4.match)(/^\n/)); | ||
var startsWithHardbreak = (0, import_ramda4.compose)(lengthNonZero, (0, import_ramda4.match)(/^\n\n/)); | ||
var endsWithHardbreak = (0, import_ramda4.compose)(lengthNonZero, (0, import_ramda4.match)(/\n\n$/)); | ||
var isUpper = (0, import_ramda4.compose)(allEqual, (0, import_ramda4.juxt)([import_ramda4.toUpper, import_ramda4.identity])); | ||
// src/rules/base.ts | ||
var isSpaceSuffix = (0, import_ramda5.compose)(lengthNonZero, spaceSuffix); | ||
var isSpacePrefix = (0, import_ramda5.compose)(lengthNonZero, spacePrefix); | ||
var log = (name, action) => { | ||
return (...args) => { | ||
const result = action(...args); | ||
if (process.env.DEBUG) { | ||
console.log(name, args, result); | ||
} | ||
return result; | ||
}; | ||
}; | ||
var _ = (0, import_ramda5.always)(true); | ||
var rule = (name, [left, right], remap = import_ramda5.identity) => { | ||
return log(name, (0, import_ramda5.compose)( | ||
(0, import_ramda5.all)(Boolean), | ||
(0, import_ramda5.zipWith)(import_ramda5.call, [left, right]), | ||
(0, import_ramda5.map)(remap) | ||
)); | ||
}; | ||
var spaceBothSides = rule("spaceBothSides", [isSpaceSuffix, isSpacePrefix], words); | ||
var rightLacksSpacePrefix = rule("rightLacksSpacePrefix", [_, (0, import_ramda5.compose)(import_ramda5.not, isSpacePrefix)], words); | ||
var rightStartsWithLowercase = rule("rightStartsWithLowercase", [_, (0, import_ramda5.compose)(startsWithLower, fstToken)]); | ||
var rightDelimiterPrefix = rule("rightDelimiterPrefix", [_, (0, import_ramda5.compose)(lengthNonZero, delimiterPrefix, fstToken)]); | ||
var rightQuotationGenericPrefix = rule("rightQuotationGenericPrefix", [_, (0, import_ramda5.compose)(lengthNonZero, quotationGenericPrefix)]); | ||
var rightQuotationClosePrefix = rule("rightQuotationClosePrefix", [_, (0, import_ramda5.compose)(lengthNonZero, quotationClosePrefix, fstToken)]); | ||
var rightBracketsClosePrefix = rule("rightBracketsClosePrefix", [_, (0, import_ramda5.compose)(lengthNonZero, bracketsClosePrefix, fstToken)]); | ||
var rightOnlySpaces = rule("rightOnlySpaces", [_, (0, import_ramda5.compose)(lengthNonZero, spaces)]); | ||
var leftEndsWithHardbreak = rule("leftEndsWithHardbreak", [endsWithHardbreak, _]); | ||
var rightStartsWithHardbreak = rule("rightStartsWithHardbreak", [_, startsWithHardbreak]); | ||
var rightStartsNewlineUppercased = rule("rightStartsNewlineUppercased", [_, (0, import_ramda5.allPass)([startsWithNewline, startsWithUpper])]); | ||
// src/rules/initials.ts | ||
var import_ramda6 = require("ramda"); | ||
var isLeftDotDelimiter = (0, import_ramda6.compose)(lengthNonZero, dotSuffix); | ||
var isLeftSingleLetter = (0, import_ramda6.compose)((0, import_ramda6.equals)(1), import_ramda6.length, lstWord); | ||
var isLeftUpper = (0, import_ramda6.compose)(allEqual, (0, import_ramda6.juxt)([import_ramda6.toUpper, import_ramda6.identity]), lstWord); | ||
var leftHasAlpha = (0, import_ramda6.compose)(hasAlpha, lstWord); | ||
var isLeftInitials = (0, import_ramda6.allPass)([isLeftDotDelimiter, isLeftSingleLetter, isLeftUpper, leftHasAlpha]); | ||
var leftInitials = (0, import_ramda6.compose)((0, import_ramda6.all)(Boolean), (0, import_ramda6.zipWith)(import_ramda6.call, [isLeftInitials, (0, import_ramda6.always)(true)])); | ||
// src/rules/abbreviations.ts | ||
var import_ramda7 = require("ramda"); | ||
var fst2 = (0, import_ramda7.compose)((0, import_ramda7.defaultTo)(""), (0, import_ramda7.view)(first())); | ||
@@ -504,3 +496,3 @@ var snd2 = (0, import_ramda7.compose)((0, import_ramda7.defaultTo)(""), (0, import_ramda7.view)(second())); | ||
// src/sentenize.ts | ||
// src/index.ts | ||
var leftPreprocessor = lstChars(20); | ||
@@ -523,24 +515,32 @@ var rightPreprocessor = fstChars(20); | ||
]); | ||
var breakCondition = (0, import_ramda8.anyPass)([ | ||
leftEndsWithHardbreak, | ||
rightStartsWithHardbreak, | ||
rightStartsNewlineUppercased | ||
]); | ||
var join2 = (0, import_ramda8.compose)(joinCondition, (0, import_ramda8.zipWith)(import_ramda8.call, sidesPreprocessors)); | ||
function processor(text) { | ||
const chunks = sentences(text); | ||
let left = null; | ||
var breaks = (0, import_ramda8.compose)(breakCondition, (0, import_ramda8.zipWith)(import_ramda8.call, sidesPreprocessors)); | ||
function sentenize(text) { | ||
const parts = text.split(/(\n{2,})/); | ||
const parsed = []; | ||
for (let i = 0; i < chunks.length; i++) { | ||
if (!left) { | ||
left = chunks[i]; | ||
continue; | ||
for (const part of parts) { | ||
const chunks = sentences(part); | ||
let left = null; | ||
for (const right of chunks) { | ||
if (!left) { | ||
left = right; | ||
continue; | ||
} | ||
if (!breaks([left, right]) && join2([left, right])) { | ||
left += right; | ||
} else { | ||
parsed.push(left); | ||
left = right; | ||
} | ||
} | ||
if (join2([left, chunks[i]])) { | ||
left += chunks[i]; | ||
} else { | ||
if (left) | ||
parsed.push(left); | ||
left = chunks[i]; | ||
} | ||
} | ||
if (left) | ||
parsed.push(left); | ||
return parsed; | ||
} | ||
var sentenize = processor; | ||
// Annotate the CommonJS export names for ESM import in node: | ||
@@ -547,0 +547,0 @@ 0 && (module.exports = { |
import { Pred } from 'ramda'; | ||
declare const pairAbbreviation: Pred<any[]> | ((obj: string[]) => boolean); | ||
declare const leftAbbreviation: (obj: string[]) => boolean; | ||
declare const leftPairsTailAbbreviation: Pred<any[]> | ((obj: string[]) => boolean); | ||
export { pairAbbreviation, leftAbbreviation, leftPairsTailAbbreviation }; | ||
export declare const pairAbbreviation: Pred<any[]> | ((obj: string[]) => boolean); | ||
export declare const leftAbbreviation: (obj: string[]) => boolean; | ||
export declare const leftPairsTailAbbreviation: Pred<any[]> | ((obj: string[]) => boolean); |
@@ -1,9 +0,11 @@ | ||
declare const spaceBothSides: (list: readonly string[]) => boolean; | ||
declare const rightLacksSpacePrefix: (list: readonly string[]) => boolean; | ||
declare const rightStartsWithLowercase: (list2: readonly unknown[]) => boolean; | ||
declare const rightDelimiterPrefix: (list2: readonly unknown[]) => boolean; | ||
declare const rightQuotationGenericPrefix: (list2: readonly unknown[]) => boolean; | ||
declare const rightQuotationClosePrefix: (list2: readonly unknown[]) => boolean; | ||
declare const rightBracketsClosePrefix: (list2: readonly unknown[]) => boolean; | ||
declare const rightOnlySpaces: (list2: readonly unknown[]) => boolean; | ||
export { spaceBothSides, rightLacksSpacePrefix, rightStartsWithLowercase, rightDelimiterPrefix, rightQuotationGenericPrefix, rightQuotationClosePrefix, rightBracketsClosePrefix, rightOnlySpaces, }; | ||
export declare const spaceBothSides: (...args: any[]) => any; | ||
export declare const rightLacksSpacePrefix: (...args: any[]) => any; | ||
export declare const rightStartsWithLowercase: (...args: any[]) => any; | ||
export declare const rightDelimiterPrefix: (...args: any[]) => any; | ||
export declare const rightQuotationGenericPrefix: (...args: any[]) => any; | ||
export declare const rightQuotationClosePrefix: (...args: any[]) => any; | ||
export declare const rightBracketsClosePrefix: (...args: any[]) => any; | ||
export declare const rightOnlySpaces: (...args: any[]) => any; | ||
export declare const leftEndsWithHardbreak: (...args: any[]) => any; | ||
export declare const rightStartsWithHardbreak: (...args: any[]) => any; | ||
export declare const rightStartsNewlineUppercased: (...args: any[]) => any; |
@@ -1,3 +0,3 @@ | ||
export { spaceBothSides, rightLacksSpacePrefix, rightStartsWithLowercase, rightDelimiterPrefix, rightQuotationGenericPrefix, rightQuotationClosePrefix, rightBracketsClosePrefix, rightOnlySpaces, } from './base'; | ||
export { leftInitials } from './initials'; | ||
export { leftAbbreviation, pairAbbreviation, leftPairsTailAbbreviation } from './abbreviations'; | ||
export * from './base'; | ||
export * from './initials'; | ||
export * from './abbreviations'; |
@@ -1,2 +0,1 @@ | ||
declare const leftInitials: (list2: readonly unknown[]) => boolean; | ||
export { leftInitials }; | ||
export declare const leftInitials: (list2: readonly unknown[]) => boolean; |
@@ -1,2 +0,2 @@ | ||
export { lenLte, allEqual, lengthNonZero } from './list'; | ||
export { charAt, notAlpha, hasAlpha, startsWithLower, isUpper } from './string'; | ||
export * from './list'; | ||
export * from './string'; |
@@ -1,4 +0,3 @@ | ||
declare const lenLte: (len: number) => (...args: any[][]) => boolean; | ||
declare const allEqual: (list: readonly unknown[]) => boolean; | ||
declare const lengthNonZero: (list: any) => boolean; | ||
export { lenLte, allEqual, lengthNonZero }; | ||
export declare const lenLte: (len: number) => (...args: any[][]) => boolean; | ||
export declare const allEqual: (list: readonly unknown[]) => boolean; | ||
export declare const lengthNonZero: (list: any) => boolean; |
import { Pred } from 'ramda'; | ||
declare const charAt: (...args: unknown[]) => any; | ||
declare const notAlpha: (str: string) => boolean; | ||
declare const hasAlpha: (str: string) => boolean; | ||
declare const startsWithLower: Pred<any[]>; | ||
declare const isUpper: (a: string) => boolean; | ||
export { charAt, notAlpha, hasAlpha, startsWithLower, isUpper }; | ||
export declare const charAt: (...args: unknown[]) => any; | ||
export declare const notAlpha: (str: string) => boolean; | ||
export declare const hasAlpha: (str: string) => boolean; | ||
export declare const startsWithLower: Pred<any[]>; | ||
export declare const startsWithUpper: Pred<any[]>; | ||
export declare const startsWithNewline: Pred<any[]>; | ||
export declare const startsWithHardbreak: Pred<any[]>; | ||
export declare const endsWithHardbreak: Pred<any[]>; | ||
export declare const isUpper: (a: string) => boolean; |
{ | ||
"name": "@diplodoc/sentenizer", | ||
"version": "0.0.6", | ||
"version": "0.0.7", | ||
"description": "text segmentation into sentences", | ||
@@ -5,0 +5,0 @@ "homepage": "https://github.com/diplodoc-platform/sentenizer", |
Environment variable access
Supply chain risk
Package accesses environment variables, which may be a sign of credential stuffing or data theft.
Found 1 instance in 1 package
30359
18
640
2