compromise
Advanced tools
Comparing version 12.3.0 to 13.0.0-rc1
@@ -11,2 +11,38 @@ compromise uses semver, and pushes to npm frequently | ||
##### 13.0.0 | ||
_major changes to `.export()` and `[capture] group` match-syntax._ | ||
- **[breaking]** move .export() and .load() methods to plugin (compromise-export) | ||
- - change .export() format - this hasn't worked properly since v12. (mis-parsed contractions) see #669 | ||
- **[breaking]** split `compromise-output` into `compromise-html` and `compromise-hash` plugins | ||
- **[breaking]** `.match('foo [bar]')` no-longer returns 'bar'. (use `.match('foo [bar]', 0)`) | ||
- **[breaking]** change `.sentences()` method to return only full-sentences of matches (use `.all()` instead) | ||
modifications: | ||
- fix nlp.clone() - hasn't worked properly since v12. (@Drache93) | ||
- fix issues with greedy capture [*] and [.+] -(@Drache93) 💛 | ||
- add whitespace properties (pre+post) to default json output (suppress with `.json({whitespace:false})`) | ||
- `.lookup({key:val})` with an object now returns an object back ({val: Doc}) | ||
- add nlp constructor as a third param to `.extend()` | ||
- support lexicon object param in tokenize - `.tokenize('my word', {word:'tag'})` | ||
- clean-up of scripts and tooling | ||
- improved typescript types | ||
- add support for some french contractions like `j'aime -> je aime` | ||
- allow null results in `.map()` function | ||
new things: | ||
- add new named-match syntax, with .byName() method (@Drache93) | ||
- add `nlp.fromJSON()` method | ||
- add a new `compromise-tokenize.js` build, without the tagger, or data included. | ||
##### 12.4.0 | ||
- adds proper `nlp.clone()` support (thanks @Drache93!) | ||
- better typescript support | ||
- allow longer acronyms | ||
- fix offset length issue | ||
##### 12.3.0 | ||
@@ -25,3 +61,3 @@ | ||
- verb conjugation and tagger bugfixes | ||
- disambiguate acryonyms & yelling | ||
- disambiguate between acronyms & yelling | ||
@@ -28,0 +64,0 @@ ##### 12.2.1 |
@@ -5,3 +5,3 @@ { | ||
"description": "modest natural language processing", | ||
"version": "12.3.0", | ||
"version": "13.0.0-rc1", | ||
"main": "./builds/compromise.js", | ||
@@ -17,21 +17,25 @@ "unpkg": "./builds/compromise.min.js", | ||
}, | ||
"engines": { | ||
"node": ">=6.0.0" | ||
}, | ||
"scripts": { | ||
"build": "npm run version && rollup -c && npm run filesize", | ||
"build:all": "node ./scripts/buildAll.js && npm run build", | ||
"pack": "node ./scripts/pack.js", | ||
"version": "node ./scripts/version.js", | ||
"test": "node ./scripts/testAll.js", | ||
"testb": "TESTENV=prod node ./scripts/testAll.js", | ||
"testOne": "tape \"./tests/**/*.test.js\" | tap-dancer", | ||
"build": "node ./scripts/build && rollup -c --silent && npm run test:smoke --silent", | ||
"pack": "node ./scripts/build/pack.js", | ||
"test": "node ./scripts/test/index.js ", | ||
"testb": "npm run test:smoke && TESTENV=prod node ./scripts/test/index.js", | ||
"test:smoke": "node \"./scripts/test/smoke-test/\" | tap-dancer", | ||
"test:spec": "tape \"./tests/**/*.test.js\" | tap-spec", | ||
"filesize": "node ./scripts/filesize.js", | ||
"test:perf": "node ./scripts/test/speed/index.js", | ||
"test:types": "ts-node ./scripts/test/types.ts | tap-dancer", | ||
"test:stress": "node ./scripts/test/stress.js", | ||
"coverage": "node ./scripts/test/coverage.js", | ||
"coverage:html": "nyc --reporter=html tape \"./tests/**/*.test.js\" | tap-dancer --color always", | ||
"lint": "eslint ./src/ && eslint ./plugins/**/src/", | ||
"watch": "amble ./scratch.js", | ||
"stress": "node ./scripts/stress-test/stress.js", | ||
"plugins": "node ./scripts/plugin-check.js", | ||
"speed": "node ./scripts/stress-test/speed.js", | ||
"demo": "python -m SimpleHTTPServer 8888", | ||
"coverage": "node ./scripts/coverage.js", | ||
"coverage:html": "nyc --reporter=html tape \"./tests/**/*.test.js\" | tap-dancer --color always", | ||
"test:types": "ts-node ./types/types.test.ts | tap-dancer", | ||
"lint": "eslint ./src/" | ||
"build:all": "node ./scripts/build/build-all.js && npm run build --silent", | ||
"plugins": "node ./scripts/plugin-check.js --silent", | ||
"plugins:install": "node ./scripts/plugins.js npm install", | ||
"plugins:ci": "node ./scripts/plugins.js npm ci", | ||
"plugins:build": "node ./scripts/plugins.js npm run build", | ||
"demo": "python -m SimpleHTTPServer 8888" | ||
}, | ||
@@ -53,19 +57,21 @@ "files": [ | ||
"devDependencies": { | ||
"@babel/core": "7.7.7", | ||
"@babel/preset-env": "7.7.7", | ||
"@babel/core": "7.8.3", | ||
"@babel/preset-env": "7.8.3", | ||
"@rollup/plugin-alias": "^3.0.0", | ||
"@rollup/plugin-commonjs": "^11.0.1", | ||
"@rollup/plugin-json": "^4.0.1", | ||
"@rollup/plugin-node-resolve": "^7.0.0", | ||
"amble": "0.0.7", | ||
"efrt": "2.2.1", | ||
"rollup": "1.28.0", | ||
"efrt": "2.2.2", | ||
"rollup": "1.30.1", | ||
"rollup-plugin-babel": "4.3.3", | ||
"rollup-plugin-commonjs": "10.1.0", | ||
"rollup-plugin-json": "4.0.0", | ||
"rollup-plugin-node-resolve": "5.2.0", | ||
"rollup-plugin-terser": "^5.1.3", | ||
"rollup-plugin-filesize-check": "0.0.1", | ||
"rollup-plugin-terser": "5.2.0", | ||
"shelljs": "0.8.3", | ||
"tap-dancer": "0.2.0", | ||
"tape": "4.12.1" | ||
"tape": "4.13.0" | ||
}, | ||
"eslintIgnore": [ | ||
"builds/*.js", | ||
"types/*.ts", | ||
"*.ts", | ||
"plugins/**/types/*.ts" | ||
@@ -72,0 +78,0 @@ ], |
@@ -29,2 +29,5 @@ <div align="center"> | ||
</a> | ||
<a href="https://spectrum.chat/nlp-compromise"> | ||
<img src="https://img.shields.io/badge/spectrum-chat-%23b14344" /> | ||
</a> | ||
</div> | ||
@@ -225,2 +228,15 @@ </div> | ||
or if you don't care about POS-tagging, you can use the tokenize-only build: (90kb!) | ||
```html | ||
<script src="https://unpkg.com/compromise/builds/compromise-tokenize.js"></script> | ||
<script> | ||
var doc = nlp('No, my son is also named Bort.') | ||
//you can see the text has no tags | ||
console.log(doc.has('#Noun')) //false | ||
//but the whole api still works | ||
console.log(doc.has('my .* is .? named /^b[oa]rt/')) //true | ||
</script> | ||
``` | ||
<!-- spacer --> | ||
@@ -323,3 +339,2 @@ <img height="30" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> | ||
- **[.extend()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - mix in a compromise-plugin | ||
- **[.load()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - re-generate a Doc object from .export() results | ||
- **[.verbose()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - log our decision-making for debugging | ||
@@ -347,5 +362,8 @@ - **[.version()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - current semver version of the library | ||
- **[.eq(n)](https://observablehq.com/@spencermountain/compromise-accessors)** - use only the nth result | ||
- **[.firstTerm()](https://observablehq.com/@spencermountain/compromise-accessors)** - get the first word in each match | ||
- **[.lastTerm()](https://observablehq.com/@spencermountain/compromise-accessors)** - get the end word in each match | ||
- **[.terms()](https://observablehq.com/@spencermountain/compromise-selections)** - split-up results by each individual term | ||
- **[.firstTerms()](https://observablehq.com/@spencermountain/compromise-accessors)** - get the first word in each match | ||
- **[.lastTerms()](https://observablehq.com/@spencermountain/compromise-accessors)** - get the end word in each match | ||
- **[.sentences()](https://observablehq.com/@spencermountain/compromise-accessors)** - get the whole sentence for each match | ||
- **[.termList()](https://observablehq.com/@spencermountain/compromise-accessors)** - return a flat list of all Term objects in match | ||
- **[.groups('')](https://observablehq.com/@spencermountain/compromise-accessors)** - grab any named capture-groups from a match | ||
@@ -428,7 +446,5 @@ ##### Match | ||
- **[.debug()](https://observablehq.com/@spencermountain/compromise-output)** - pretty-print the current document and its tags | ||
- **[.export()](https://observablehq.com/@spencermountain/compromise-export)** - store a parsed document for later use | ||
##### Selections | ||
- **[.terms()](https://observablehq.com/@spencermountain/compromise-selections)** - split-up results by each individual term | ||
- **[.clauses()](https://observablehq.com/@spencermountain/compromise-selections)** - split-up sentences into multi-term phrases | ||
@@ -547,2 +563,29 @@ - **[.hyphenated()](https://observablehq.com/@spencermountain/compromise-selections)** - all terms connected with a hyphen or dash like `'wash-out'` | ||
##### Export | ||
`npm install compromise-export` | ||
- **[.export()](https://observablehq.com/@spencermountain/compromise-export)** - store a parsed document for later use | ||
- **[nlp.load()](https://observablehq.com/@spencermountain/compromise-export)** - re-generate a Doc object from .export() results | ||
##### Html | ||
`npm install compromise-html` | ||
- **[.html({})](https://observablehq.com/@spencermountain/compromise-html)** - generate sanitized html from the document | ||
##### Hash | ||
`npm install compromise-hash` | ||
- **[.hash()](https://observablehq.com/@spencermountain/compromise-hash)** - generate an md5 hash from the document+tags | ||
- **[.isEqual(doc)](https://observablehq.com/@spencermountain/compromise-hash)** - compare the hash of two documents for semantic-equality | ||
##### Keypress | ||
`npm install compromise-keypress` | ||
- **[nlp.keypress('')](https://observablehq.com/@spencermountain/compromise-keypress)** - generate an md5 hash from the document+tags | ||
- **[nlp.clear('')](https://observablehq.com/@spencermountain/compromise-keypress)** - clean-up any cached sentences from memory | ||
##### Ngrams | ||
@@ -560,9 +603,2 @@ | ||
##### Output | ||
`npm install compromise-output` | ||
- **[.hash()](#)** - generate an md5 hash from the document+tags | ||
- **[.html({})]()** - generate sanitized html from the document | ||
##### Paragraphs | ||
@@ -744,6 +780,10 @@ | ||
<ul> | ||
compromise isn't easily tree-shaken. | ||
we do offer a [compromise-tokenize](./builds/compromise-tokenize.js) build, which has the POS-tagger pulled-out. | ||
<br/> | ||
but otherwise, compromise isn't easily tree-shaken. | ||
<br/> | ||
the tagging methods are competitive, and greedy, so it's not recommended to pull things out. | ||
<br/> | ||
Note that without a full POS-tagging, the contraction-parser won't work perfectly. (<i>(spencer's cool)</i> vs. <i>(spencer's house)</i>) | ||
<br/> | ||
It's recommended to run the library fully. | ||
@@ -750,0 +790,0 @@ </ul> |
export as namespace nlp | ||
// a key-value object of words, terms | ||
declare interface Lexicon { | ||
[key: string]: string | ||
} | ||
// documents indexed by a string | ||
declare interface DocIndex<W extends nlp.World = nlp.World> { | ||
[key: string]: nlp.Document<W> | ||
} | ||
declare interface nlp<D extends object, W extends object> { | ||
/** normal usage */ | ||
(text: string): nlp.ExtendedDocument<D, W> | ||
(text?: string, lexicon?: Lexicon): nlp.ExtendedDocument<D, W> | ||
/** tokenize string */ | ||
tokenize(text: string): nlp.ExtendedDocument<D, W> | ||
tokenize(text: string, lexicon?: Lexicon): nlp.ExtendedDocument<D, W> | ||
/** mix in a compromise-plugin */ | ||
@@ -17,5 +26,7 @@ extend<P>( | ||
/** re-generate a Doc object from .json() results */ | ||
load(json: any): nlp.ExtendedDocument<D, W> | ||
fromJSON(json: any): nlp.ExtendedDocument<D, W> | ||
/** log our decision-making for debugging */ | ||
verbose(bool: boolean): nlp.ExtendedDocument<D, W> | ||
verbose(bool?: boolean): nlp.ExtendedDocument<D, W> | ||
/** create instance using global world*/ | ||
clone(): nlp<D, W> | ||
/** current semver version of the library */ | ||
@@ -25,4 +36,4 @@ version: nlp.ExtendedDocument<D, W> | ||
declare function nlp(text: string): nlp.DefaultDocument | ||
declare function nlp<D extends object, W extends object>(text: string): nlp.ExtendedDocument<D, W> | ||
declare function nlp(text?: string, lexicon?: Lexicon): nlp.DefaultDocument | ||
declare function nlp<D extends object, W extends object>(text?: string): nlp.ExtendedDocument<D, W> | ||
@@ -61,5 +72,20 @@ // possible values to .json() | ||
// Cleaner plugin types | ||
type PluginWorld<D extends object, W extends object> = { | ||
// Override post process type | ||
postProcess(process: (Doc: nlp.ExtendedDocument<D, W>) => void): nlp.ExtendedWorld<W> | ||
} & nlp.ExtendedWorld<W> | ||
type PluginDocument<D extends object, W extends object> = nlp.ExtendedDocument<D, W> & { prototype: D } | ||
type PluginTerm = nlp.Term & PluginConstructor | ||
// Make these available, full support tbd | ||
type PluginConstructor = { | ||
prototype: Record<string, any> | ||
} | ||
// Constructor | ||
declare module nlp { | ||
export function tokenize(text: string): DefaultDocument | ||
export function tokenize(text?: string, lexicon?: Lexicon): DefaultDocument | ||
/** mix in a compromise-plugin */ | ||
@@ -70,5 +96,7 @@ export function extend<P>( | ||
/** re-generate a Doc object from .json() results */ | ||
export function load(json: any): DefaultDocument | ||
export function fromJSON(json: any): DefaultDocument | ||
/** log our decision-making for debugging */ | ||
export function verbose(bool: boolean): DefaultDocument | ||
export function verbose(bool?: boolean): DefaultDocument | ||
/** create instance using global world */ | ||
export function clone(): nlp<{}, {}> | ||
/** current semver version of the library */ | ||
@@ -78,4 +106,8 @@ export const version: number | ||
type Plugin<D extends object, W extends object> = ( | ||
Doc: Document<World & W> & D & { prototype: D }, | ||
world: World & W | ||
Doc: PluginDocument<D, W>, | ||
world: PluginWorld<D, W>, | ||
nlp: nlp<D, W>, | ||
Phrase: PluginConstructor, | ||
Term: PluginTerm, // @todo Add extend support | ||
Pool: PluginConstructor | ||
) => void | ||
@@ -126,7 +158,11 @@ | ||
/** get the first word in each match */ | ||
firstTerm(): Document<W> | ||
firstTerms(): Document<W> | ||
/** get the end word in each match */ | ||
lastTerm(): Document<W> | ||
lastTerms(): Document<W> | ||
/** return a flat list of all Term objects in match */ | ||
termList(): any | ||
termList(): Term[] | ||
/** grab a specific named capture group */ | ||
groups(name: string): Document<W> | ||
/** grab all named capture groups */ | ||
groups(): DocIndex<W> | ||
@@ -156,2 +192,4 @@ // Match | ||
lookup(matches: string[]): Document<W> | ||
/** quick find for an object of key-value matches */ | ||
lookup(matches: Lexicon): DocIndex<W> | ||
@@ -298,3 +336,3 @@ // Case | ||
// Subsets | ||
/** alias for .all(), until plugin overloading */ | ||
/** get the whole sentence for each match */ | ||
sentences(): Document<W> | ||
@@ -418,5 +456,173 @@ /** return things like `'Mrs.'`*/ | ||
class World {} | ||
class World { | ||
/** more logs for debugging */ | ||
verbose(on?: boolean): this | ||
isVerbose(): boolean | ||
/** get all terms in our lexicon with this tag */ | ||
getByTag(tag: string): Record<string, true> | ||
/** put new words into our lexicon, properly */ | ||
addWords(words: Record<string, string>): void | ||
/** extend the compromise tagset */ | ||
addTags( | ||
tags: Record< | ||
string, | ||
{ | ||
isA?: string | string[] | ||
notA?: string | string[] | ||
} | ||
> | ||
): void | ||
/** call methods after tagger runs */ | ||
postProcess<D extends Document = Document>(process: (Doc: D) => void): this | ||
} | ||
// @todo | ||
interface RegSyntax { | ||
[index: string]: any | ||
} | ||
type TextOutOptions = | ||
| 'reduced' | ||
| 'root' | ||
| 'implicit' | ||
| 'normal' | ||
| 'unicode' | ||
| 'titlecase' | ||
| 'lowercase' | ||
| 'acronyms' | ||
| 'whitespace' | ||
| 'punctuation' | ||
| 'abbreviations' | ||
type JsonOutOptions = 'text' | 'normal' | 'tags' | 'clean' | 'id' | 'offset' | 'implicit' | 'whitespace' | 'bestTag' | ||
class Term { | ||
isA: 'Term' // Get Type | ||
id: string | ||
// main data | ||
text: string | ||
tags: Record<string, boolean> | ||
// alternative forms of this.text | ||
root: string | null | ||
implicit: string | null | ||
clean?: string | ||
reduced?: string | ||
// additional surrounding information | ||
prev: string | null // id of prev term | ||
next: string | null // id of next term | ||
pre?: string // character before e.g. ' ' ',' | ||
post?: string // character after e.g. ' ' ',' | ||
// support alternative matches | ||
alias?: string | ||
constructor(text?: string) | ||
set(text: string): this | ||
/** clone contents to new term */ | ||
clone(): Term | ||
/** convert all text to uppercase */ | ||
toUpperCase(): this | ||
/** convert all text to lowercase */ | ||
toLowerCase(): this | ||
/** only set the first letter to uppercase | ||
* leave any existing uppercase alone | ||
*/ | ||
toTitleCase(): this | ||
/** if all letters are uppercase */ | ||
isUpperCase(): this | ||
/** if the first letter is uppercase, and the rest are lowercase */ | ||
isTitleCase(): this | ||
titleCase(): this | ||
/** search the term's 'post' punctuation */ | ||
hasPost(): boolean | ||
/** search the term's 'pre' punctuation */ | ||
hasPre(): boolean | ||
/** does it have a quotation symbol? */ | ||
hasQuote(): boolean | ||
hasQuotation(): boolean | ||
/** does it have a comma? */ | ||
hasComma(): boolean | ||
/** does it end in a period? */ | ||
hasPeriod(): boolean | ||
/** does it end in an exclamation */ | ||
hasExclamation(): boolean | ||
/** does it end with a question mark? */ | ||
hasQuestionMark(): boolean | ||
/** is there a ... at the end? */ | ||
hasEllipses(): boolean | ||
/** is there a semicolon after this word? */ | ||
hasSemicolon(): boolean | ||
/** is there a slash '/' in this word? */ | ||
hasSlash(): boolean | ||
/** a hyphen connects two words like-this */ | ||
hasHyphen(): boolean | ||
/** a dash separates words - like that */ | ||
hasDash(): boolean | ||
/** is it multiple words combined */ | ||
hasContraction(): boolean | ||
/** try to sensibly put this punctuation mark into the term */ | ||
addPunctuation(punct: string): this | ||
doesMatch(reg: RegSyntax, index: number, length: number): boolean | ||
/** does this term look like an acronym? */ | ||
isAcronym(): boolean | ||
/** is this term implied by a contraction? */ | ||
isImplicit(): boolean | ||
/** does the term have at least one good tag? */ | ||
isKnown(): boolean | ||
/** cache the root property of the term */ | ||
setRoot(world: World): void | ||
/** return various text formats of this term */ | ||
textOut(options?: Record<TextOutOptions, boolean>, showPre?: boolean, showPost?: boolean): string | ||
/** return various metadata for this term */ | ||
// @todo create output type from options... | ||
json(options?: Record<JsonOutOptions, boolean>, world?: World): object | ||
/** add a tag or tags, and their descendants to this term */ | ||
tag(tags: string | string[], reason?: string, world?: World): this | ||
/** only tag this term if it's consistent with its current tags */ | ||
tagSafe(tags: string | string[], reason?: string, world?: World): this | ||
/** remove a tag or tags, and their descendants from this term */ | ||
unTag(tags: string | string[], reason?: string, world?: World): this | ||
/** is this tag consistent with the word's current tags? */ | ||
canBe(tags: string | string[], world?: World): boolean | ||
} | ||
} | ||
export default nlp |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
No v1
QualityPackage is not semver >=1. This means it is not stable and does not support ^ ranges.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
1744889
10
21817
805
15
2