@diplodoc/sentenizer
Advanced tools
Comparing version 0.0.7 to 0.0.8
@@ -5,5 +5,5 @@ var __defProp = Object.defineProperty; | ||
var __hasOwnProp = Object.prototype.hasOwnProperty; | ||
var __export = (target, all3) => { | ||
for (var name in all3) | ||
__defProp(target, name, { get: all3[name], enumerable: true }); | ||
// Bundler helper: for every key in `all2`, defines a property of the same name on `target`
// as an enumerable getter, using the function stored under that key as the getter.
var __export = (target, all2) => { | ||
for (var name in all2) | ||
__defProp(target, name, { get: all2[name], enumerable: true }); | ||
}; | ||
@@ -26,3 +26,3 @@ var __copyProps = (to, from, except, desc) => { | ||
module.exports = __toCommonJS(src_exports); | ||
var import_ramda8 = require("ramda"); | ||
var import_ramda7 = require("ramda"); | ||
@@ -184,4 +184,6 @@ // src/parsers/index.ts | ||
// гос. экзамены | ||
"\u043E\u0442\u043C": true | ||
"\u043E\u0442\u043C": true, | ||
// от отм. 0.000 | ||
"\u0434\u043E\u0431": true | ||
// доб. 1243 (телефон) | ||
}; | ||
@@ -396,2 +398,6 @@ var TAIL = { | ||
// src/rules/base.ts | ||
var isLeftDotDelimiter = (0, import_ramda5.compose)(lengthNonZero, dotSuffix); | ||
var isLeftSingleLetter = (0, import_ramda5.compose)((0, import_ramda5.equals)(1), import_ramda5.length, lstWord); | ||
var isLeftUpper = (0, import_ramda5.compose)(allEqual, (0, import_ramda5.juxt)([import_ramda5.toUpper, import_ramda5.identity]), lstWord); | ||
var leftHasAlpha = (0, import_ramda5.compose)(hasAlpha, lstWord); | ||
var isSpaceSuffix = (0, import_ramda5.compose)(lengthNonZero, spaceSuffix); | ||
@@ -427,62 +433,55 @@ var isSpacePrefix = (0, import_ramda5.compose)(lengthNonZero, spacePrefix); | ||
var rightStartsNewlineUppercased = rule("rightStartsNewlineUppercased", [_, (0, import_ramda5.allPass)([startsWithNewline, startsWithUpper])]); | ||
var leftInitials = rule("leftInitials", [(0, import_ramda5.allPass)([isLeftDotDelimiter, isLeftSingleLetter, isLeftUpper, leftHasAlpha]), _]); | ||
// src/rules/initials.ts | ||
// src/rules/abbreviations.ts | ||
var import_ramda6 = require("ramda"); | ||
var isLeftDotDelimiter = (0, import_ramda6.compose)(lengthNonZero, dotSuffix); | ||
var isLeftSingleLetter = (0, import_ramda6.compose)((0, import_ramda6.equals)(1), import_ramda6.length, lstWord); | ||
var isLeftUpper = (0, import_ramda6.compose)(allEqual, (0, import_ramda6.juxt)([import_ramda6.toUpper, import_ramda6.identity]), lstWord); | ||
var leftHasAlpha = (0, import_ramda6.compose)(hasAlpha, lstWord); | ||
var isLeftInitials = (0, import_ramda6.allPass)([isLeftDotDelimiter, isLeftSingleLetter, isLeftUpper, leftHasAlpha]); | ||
var leftInitials = (0, import_ramda6.compose)((0, import_ramda6.all)(Boolean), (0, import_ramda6.zipWith)(import_ramda6.call, [isLeftInitials, (0, import_ramda6.always)(true)])); | ||
// src/rules/abbreviations.ts | ||
var import_ramda7 = require("ramda"); | ||
var fst2 = (0, import_ramda7.compose)((0, import_ramda7.defaultTo)(""), (0, import_ramda7.view)(first())); | ||
var snd2 = (0, import_ramda7.compose)((0, import_ramda7.defaultTo)(""), (0, import_ramda7.view)(second())); | ||
var isDotDelimiter = (0, import_ramda7.compose)(lengthNonZero, dotSuffix); | ||
var hash = (0, import_ramda7.compose)(import_ramda7.toLower, (0, import_ramda7.join)(".")); | ||
var insidePairAbbreviationMap = (0, import_ramda7.anyPass)([ | ||
(0, import_ramda7.prop)(import_ramda7.__, HEAD_PAIR), | ||
(0, import_ramda7.prop)(import_ramda7.__, TAIL_PAIR), | ||
(0, import_ramda7.prop)(import_ramda7.__, OTHER_PAIR) | ||
var fst2 = (0, import_ramda6.compose)((0, import_ramda6.defaultTo)(""), (0, import_ramda6.view)(first())); | ||
var snd2 = (0, import_ramda6.compose)((0, import_ramda6.defaultTo)(""), (0, import_ramda6.view)(second())); | ||
var isDotDelimiter = (0, import_ramda6.compose)(lengthNonZero, dotSuffix); | ||
var hash = (0, import_ramda6.compose)(import_ramda6.toLower, (0, import_ramda6.join)(".")); | ||
var insidePairAbbreviationMap = (0, import_ramda6.anyPass)([ | ||
(0, import_ramda6.prop)(import_ramda6.__, HEAD_PAIR), | ||
(0, import_ramda6.prop)(import_ramda6.__, TAIL_PAIR), | ||
(0, import_ramda6.prop)(import_ramda6.__, OTHER_PAIR) | ||
]); | ||
var isPairAbbreviation = (0, import_ramda7.compose)( | ||
var isPairAbbreviation = (0, import_ramda6.compose)( | ||
insidePairAbbreviationMap, | ||
hash, | ||
(0, import_ramda7.zipWith)(import_ramda7.call, [ | ||
(0, import_ramda7.compose)(omitNonAlphaStart, lstWord, lstToken), | ||
(0, import_ramda7.compose)(fstWord, fstToken) | ||
(0, import_ramda6.zipWith)(import_ramda6.call, [ | ||
(0, import_ramda6.compose)(omitNonAlphaStart, lstWord, lstToken), | ||
(0, import_ramda6.compose)(fstWord, fstToken) | ||
]) | ||
); | ||
var pairAbbreviation = (0, import_ramda7.allPass)([ | ||
(0, import_ramda7.compose)(isDotDelimiter, lstToken, fst2), | ||
var pairAbbreviation = (0, import_ramda6.allPass)([ | ||
(0, import_ramda6.compose)(isDotDelimiter, lstToken, fst2), | ||
isPairAbbreviation | ||
]); | ||
var insideAbbreviationMap = (0, import_ramda7.anyPass)([ | ||
var insideAbbreviationMap = (0, import_ramda6.anyPass)([ | ||
// @ts-ignore | ||
(0, import_ramda7.prop)(import_ramda7.__, INITIALS), | ||
(0, import_ramda6.prop)(import_ramda6.__, INITIALS), | ||
// @ts-ignore | ||
(0, import_ramda7.prop)(import_ramda7.__, HEAD), | ||
(0, import_ramda6.prop)(import_ramda6.__, HEAD), | ||
// @ts-ignore | ||
(0, import_ramda7.prop)(import_ramda7.__, TAIL), | ||
(0, import_ramda6.prop)(import_ramda6.__, TAIL), | ||
// @ts-ignore | ||
(0, import_ramda7.prop)(import_ramda7.__, OTHER) | ||
(0, import_ramda6.prop)(import_ramda6.__, OTHER) | ||
]); | ||
var isLeftAbbreviation = (0, import_ramda7.compose)( | ||
var isLeftAbbreviation = (0, import_ramda6.compose)( | ||
insideAbbreviationMap, | ||
omitNonAlphaStart, | ||
import_ramda7.toLower, | ||
import_ramda6.toLower, | ||
lstWord, | ||
lstToken | ||
); | ||
var leftAbbreviation = (0, import_ramda7.compose)( | ||
(0, import_ramda7.allPass)([(0, import_ramda7.compose)(isDotDelimiter, lstToken), isLeftAbbreviation]), | ||
var leftAbbreviation = (0, import_ramda6.compose)( | ||
(0, import_ramda6.allPass)([(0, import_ramda6.compose)(isDotDelimiter, lstToken), isLeftAbbreviation]), | ||
fst2 | ||
); | ||
var rightLowercaseOrCaps = (0, import_ramda7.compose)((0, import_ramda7.anyPass)([startsWithLower, isUpper]), fstWord, snd2); | ||
// isCaps: passes only when the value satisfies `isUpper` (defined elsewhere) AND its length is
// greater than 1 (`lt(1)` applied to the length checks 1 < length), so a single letter does not count.
var isCaps = (0, import_ramda6.allPass)([isUpper, (0, import_ramda6.compose)((0, import_ramda6.lt)(1), import_ramda6.length)]); | ||
// rightLowercaseOrCaps: applies `snd2` to the input, then `fstWord` to that result, and passes
// when the outcome satisfies either `startsWithLower` (defined elsewhere) or `isCaps`.
var rightLowercaseOrCaps = (0, import_ramda6.compose)((0, import_ramda6.anyPass)([startsWithLower, isCaps]), fstWord, snd2); | ||
// before(s)(t): returns the part of `s` that precedes the first occurrence of `t`.
// When `t` is not found, `indexOf` yields -1, which is clamped to 0, so the result is "".
var before = (s) => (t) => s.slice(0, Math.max(s.indexOf(t), 0)); | ||
var isLeftPairsTail = (left) => { | ||
const rest = before(left); | ||
const head = (0, import_ramda7.compose)(words, lstWord, rest, lstWord, lstToken); | ||
return (0, import_ramda7.or)( | ||
const head = (0, import_ramda6.compose)(words, lstWord, rest, lstWord, lstToken); | ||
return (0, import_ramda6.or)( | ||
isPairAbbreviation([head(left), lstWord(left)]), | ||
@@ -492,5 +491,5 @@ isPairAbbreviation(lstWord(left).split(".")) | ||
}; | ||
var leftPairsTailAbbreviation = (0, import_ramda7.allPass)([ | ||
(0, import_ramda7.compose)(isDotDelimiter, lstToken, fst2), | ||
(0, import_ramda7.compose)(isLeftPairsTail, fst2), | ||
var leftPairsTailAbbreviation = (0, import_ramda6.allPass)([ | ||
(0, import_ramda6.compose)(isDotDelimiter, lstToken, fst2), | ||
(0, import_ramda6.compose)(isLeftPairsTail, fst2), | ||
rightLowercaseOrCaps | ||
@@ -503,3 +502,3 @@ ]); | ||
var sidesPreprocessors = [leftPreprocessor, rightPreprocessor]; | ||
var joinCondition = (0, import_ramda8.anyPass)([ | ||
var joinCondition = (0, import_ramda7.anyPass)([ | ||
spaceBothSides, | ||
@@ -518,3 +517,3 @@ rightLacksSpacePrefix, | ||
]); | ||
var breakCondition = (0, import_ramda8.anyPass)([ | ||
var breakCondition = (0, import_ramda7.anyPass)([ | ||
leftEndsWithHardbreak, | ||
@@ -524,6 +523,6 @@ rightStartsWithHardbreak, | ||
]); | ||
var join2 = (0, import_ramda8.compose)(joinCondition, (0, import_ramda8.zipWith)(import_ramda8.call, sidesPreprocessors)); | ||
var breaks = (0, import_ramda8.compose)(breakCondition, (0, import_ramda8.zipWith)(import_ramda8.call, sidesPreprocessors)); | ||
var join2 = (0, import_ramda7.compose)(joinCondition, (0, import_ramda7.zipWith)(import_ramda7.call, sidesPreprocessors)); | ||
var breaks = (0, import_ramda7.compose)(breakCondition, (0, import_ramda7.zipWith)(import_ramda7.call, sidesPreprocessors)); | ||
function sentenize(text) { | ||
const parts = text.split(/(\n{2,})/); | ||
const parts = text.split(/((?:\n\s*){2,})/); | ||
const parsed = []; | ||
@@ -530,0 +529,0 @@ for (const part of parts) { |
@@ -12,1 +12,2 @@ export declare const spaceBothSides: (...args: any[]) => any; | ||
export declare const rightStartsNewlineUppercased: (...args: any[]) => any; | ||
export declare const leftInitials: (...args: any[]) => any; |
export * from './base'; | ||
export * from './initials'; | ||
export * from './abbreviations'; |
{ | ||
"name": "@diplodoc/sentenizer", | ||
"version": "0.0.7", | ||
"version": "0.0.8", | ||
"description": "text segmentation into sentences", | ||
@@ -5,0 +5,0 @@ "homepage": "https://github.com/diplodoc-platform/sentenizer", |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
30297
17
639