Socket
Socket
Sign in · Demo · Install

compromise

Package Overview
Dependencies
Maintainers
2
Versions
169
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

compromise - npm Package Compare versions

Comparing version 12.3.0 to 13.0.0-rc1

builds/compromise-tokenize.js

38

changelog.md

@@ -11,2 +11,38 @@ compromise uses semver, and pushes to npm frequently

##### 13.0.0
_major changes to `.export()` and `[capture] group` match-syntax._
- **[breaking]** move .export() and .load() methods to plugin (compromise-export)
- - change .export() format - this hasn't worked properly since v12. (mis-parsed contractions) see #669
- **[breaking]** split `compromise-output` into `compromise-html` and `compromise-hash` plugins
- **[breaking]** `.match('foo [bar]')` no-longer returns 'bar'. (use `.match('foo [bar]', 0)`)
- **[breaking]** change `.sentences()` method to return only full-sentences of matches (use `.all()` instead)
modifications:
- fix nlp.clone() - hasn't worked properly since v12. (@Drache93)
- fix issues with greedy capture [*] and [.+] -(@Drache93) 💛
- add whitespace properties (pre+post) to default json output (suppress with `.json({whitespace:false})`)
- `.lookup({key:val})` with an object now returns an object back ({val: Doc})
- add nlp constructor as a third param to `.extend()`
- support lexicon object param in tokenize - `.tokenize('my word', {word:'tag'})`
- clean-up of scripts and tooling
- improved typescript types
- add support for some french contractions like `j'aime -> je aime`
- allow null results in `.map()` function
new things:
- add new named-match syntax, with .byName() method (@Drache93)
- add `nlp.fromJSON()` method
- add a new `compromise-tokenize.js` build, without the tagger, or data included.
##### 12.4.0
- adds proper `nlp.clone()` support (thanks @Drache93!)
- better typescript support
- allow longer acronyms
- fix offset length issue
##### 12.3.0

@@ -25,3 +61,3 @@

- verb conjugation and tagger bugfixes
- disambiguate acryonyms & yelling
- disambiguate between acronyms & yelling

@@ -28,0 +64,0 @@ ##### 12.2.1

60

package.json

@@ -5,3 +5,3 @@ {

"description": "modest natural language processing",
"version": "12.3.0",
"version": "13.0.0-rc1",
"main": "./builds/compromise.js",

@@ -17,21 +17,25 @@ "unpkg": "./builds/compromise.min.js",

},
"engines": {
"node": ">=6.0.0"
},
"scripts": {
"build": "npm run version && rollup -c && npm run filesize",
"build:all": "node ./scripts/buildAll.js && npm run build",
"pack": "node ./scripts/pack.js",
"version": "node ./scripts/version.js",
"test": "node ./scripts/testAll.js",
"testb": "TESTENV=prod node ./scripts/testAll.js",
"testOne": "tape \"./tests/**/*.test.js\" | tap-dancer",
"build": "node ./scripts/build && rollup -c --silent && npm run test:smoke --silent",
"pack": "node ./scripts/build/pack.js",
"test": "node ./scripts/test/index.js ",
"testb": "npm run test:smoke && TESTENV=prod node ./scripts/test/index.js",
"test:smoke": "node \"./scripts/test/smoke-test/\" | tap-dancer",
"test:spec": "tape \"./tests/**/*.test.js\" | tap-spec",
"filesize": "node ./scripts/filesize.js",
"test:perf": "node ./scripts/test/speed/index.js",
"test:types": "ts-node ./scripts/test/types.ts | tap-dancer",
"test:stress": "node ./scripts/test/stress.js",
"coverage": "node ./scripts/test/coverage.js",
"coverage:html": "nyc --reporter=html tape \"./tests/**/*.test.js\" | tap-dancer --color always",
"lint": "eslint ./src/ && eslint ./plugins/**/src/",
"watch": "amble ./scratch.js",
"stress": "node ./scripts/stress-test/stress.js",
"plugins": "node ./scripts/plugin-check.js",
"speed": "node ./scripts/stress-test/speed.js",
"demo": "python -m SimpleHTTPServer 8888",
"coverage": "node ./scripts/coverage.js",
"coverage:html": "nyc --reporter=html tape \"./tests/**/*.test.js\" | tap-dancer --color always",
"test:types": "ts-node ./types/types.test.ts | tap-dancer",
"lint": "eslint ./src/"
"build:all": "node ./scripts/build/build-all.js && npm run build --silent",
"plugins": "node ./scripts/plugin-check.js --silent",
"plugins:install": "node ./scripts/plugins.js npm install",
"plugins:ci": "node ./scripts/plugins.js npm ci",
"plugins:build": "node ./scripts/plugins.js npm run build",
"demo": "python -m SimpleHTTPServer 8888"
},

@@ -53,19 +57,21 @@ "files": [

"devDependencies": {
"@babel/core": "7.7.7",
"@babel/preset-env": "7.7.7",
"@babel/core": "7.8.3",
"@babel/preset-env": "7.8.3",
"@rollup/plugin-alias": "^3.0.0",
"@rollup/plugin-commonjs": "^11.0.1",
"@rollup/plugin-json": "^4.0.1",
"@rollup/plugin-node-resolve": "^7.0.0",
"amble": "0.0.7",
"efrt": "2.2.1",
"rollup": "1.28.0",
"efrt": "2.2.2",
"rollup": "1.30.1",
"rollup-plugin-babel": "4.3.3",
"rollup-plugin-commonjs": "10.1.0",
"rollup-plugin-json": "4.0.0",
"rollup-plugin-node-resolve": "5.2.0",
"rollup-plugin-terser": "^5.1.3",
"rollup-plugin-filesize-check": "0.0.1",
"rollup-plugin-terser": "5.2.0",
"shelljs": "0.8.3",
"tap-dancer": "0.2.0",
"tape": "4.12.1"
"tape": "4.13.0"
},
"eslintIgnore": [
"builds/*.js",
"types/*.ts",
"*.ts",
"plugins/**/types/*.ts"

@@ -72,0 +78,0 @@ ],

@@ -29,2 +29,5 @@ <div align="center">

</a>
<a href="https://spectrum.chat/nlp-compromise">
<img src="https://img.shields.io/badge/spectrum-chat-%23b14344" />
</a>
</div>

@@ -225,2 +228,15 @@ </div>

or if you don't care about POS-tagging, you can use the tokenize-only build: (90kb!)
```html
<script src="https://unpkg.com/compromise/builds/compromise-tokenize.js"></script>
<script>
var doc = nlp('No, my son is also named Bort.')
//you can see the text has no tags
console.log(doc.has('#Noun')) //false
//but the whole api still works
console.log(doc.has('my .* is .? named /^b[oa]rt/')) //true
</script>
```
<!-- spacer -->

@@ -323,3 +339,2 @@ <img height="30" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>

- **[.extend()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - mix in a compromise-plugin
- **[.load()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - re-generate a Doc object from .export() results
- **[.verbose()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - log our decision-making for debugging

@@ -347,5 +362,8 @@ - **[.version()](https://observablehq.com/@spencermountain/compromise-constructor-methods)** - current semver version of the library

- **[.eq(n)](https://observablehq.com/@spencermountain/compromise-accessors)** - use only the nth result
- **[.firstTerm()](https://observablehq.com/@spencermountain/compromise-accessors)** - get the first word in each match
- **[.lastTerm()](https://observablehq.com/@spencermountain/compromise-accessors)** - get the end word in each match
- **[.terms()](https://observablehq.com/@spencermountain/compromise-selections)** - split-up results by each individual term
- **[.firstTerms()](https://observablehq.com/@spencermountain/compromise-accessors)** - get the first word in each match
- **[.lastTerms()](https://observablehq.com/@spencermountain/compromise-accessors)** - get the end word in each match
- **[.sentences()](https://observablehq.com/@spencermountain/compromise-accessors)** - get the whole sentence for each match
- **[.termList()](https://observablehq.com/@spencermountain/compromise-accessors)** - return a flat list of all Term objects in match
- **[.groups('')](https://observablehq.com/@spencermountain/compromise-accessors)** - grab any named capture-groups from a match

@@ -428,7 +446,5 @@ ##### Match

- **[.debug()](https://observablehq.com/@spencermountain/compromise-output)** - pretty-print the current document and its tags
- **[.export()](https://observablehq.com/@spencermountain/compromise-export)** - store a parsed document for later use
##### Selections
- **[.terms()](https://observablehq.com/@spencermountain/compromise-selections)** - split-up results by each individual term
- **[.clauses()](https://observablehq.com/@spencermountain/compromise-selections)** - split-up sentences into multi-term phrases

@@ -547,2 +563,29 @@ - **[.hyphenated()](https://observablehq.com/@spencermountain/compromise-selections)** - all terms connected with a hyphen or dash like `'wash-out'`

##### Export
`npm install compromise-export`
- **[.export()](https://observablehq.com/@spencermountain/compromise-export)** - store a parsed document for later use
- **[nlp.load()](https://observablehq.com/@spencermountain/compromise-export)** - re-generate a Doc object from .export() results
##### Html
`npm install compromise-html`
- **[.html({})](https://observablehq.com/@spencermountain/compromise-html)** - generate sanitized html from the document
##### Hash
`npm install compromise-hash`
- **[.hash()](https://observablehq.com/@spencermountain/compromise-hash)** - generate an md5 hash from the document+tags
- **[.isEqual(doc)](https://observablehq.com/@spencermountain/compromise-hash)** - compare the hash of two documents for semantic-equality
##### Keypress
`npm install compromise-keypress`
- **[nlp.keypress('')](https://observablehq.com/@spencermountain/compromise-keypress)** - generate an md5 hash from the document+tags
- **[nlp.clear('')](https://observablehq.com/@spencermountain/compromise-keypress)** - clean-up any cached sentences from memory
##### Ngrams

@@ -560,9 +603,2 @@

##### Output
`npm install compromise-output`
- **[.hash()](#)** - generate an md5 hash from the document+tags
- **[.html({})]()** - generate sanitized html from the document
##### Paragraphs

@@ -744,6 +780,10 @@

<ul>
compromise isn't easily tree-shaken.
we do offer a [compromise-tokenize](./builds/compromise-tokenize.js) build, which has the POS-tagger pulled-out.
<br/>
but otherwise, compromise isn't easily tree-shaken.
<br/>
the tagging methods are competitive, and greedy, so it's not recommended to pull things out.
<br/>
Note that without a full POS-tagging, the contraction-parser won't work perfectly. (<i>(spencer's cool)</i> vs. <i>(spencer's house)</i>)
<br/>
It's recommended to run the library fully.

@@ -750,0 +790,0 @@ </ul>

export as namespace nlp
// a key-value object of words, terms
declare interface Lexicon {
[key: string]: string
}
// documents indexed by a string
declare interface DocIndex<W extends nlp.World = nlp.World> {
[key: string]: nlp.Document<W>
}
declare interface nlp<D extends object, W extends object> {
/** normal usage */
(text: string): nlp.ExtendedDocument<D, W>
(text?: string, lexicon?: Lexicon): nlp.ExtendedDocument<D, W>
/** tokenize a string */
tokenize(text: string): nlp.ExtendedDocument<D, W>
tokenize(text: string, lexicon?: Lexicon): nlp.ExtendedDocument<D, W>
/** mix in a compromise-plugin */

@@ -17,5 +26,7 @@ extend<P>(

/** re-generate a Doc object from .json() results */
load(json: any): nlp.ExtendedDocument<D, W>
fromJSON(json: any): nlp.ExtendedDocument<D, W>
/** log our decision-making for debugging */
verbose(bool: boolean): nlp.ExtendedDocument<D, W>
verbose(bool?: boolean): nlp.ExtendedDocument<D, W>
/** create instance using global world*/
clone(): nlp<D, W>
/** current semver version of the library */

@@ -25,4 +36,4 @@ version: nlp.ExtendedDocument<D, W>

declare function nlp(text: string): nlp.DefaultDocument
declare function nlp<D extends object, W extends object>(text: string): nlp.ExtendedDocument<D, W>
declare function nlp(text?: string, lexicon?: Lexicon): nlp.DefaultDocument
declare function nlp<D extends object, W extends object>(text?: string): nlp.ExtendedDocument<D, W>

@@ -61,5 +72,20 @@ // possible values to .json()

// Cleaner plugin types
type PluginWorld<D extends object, W extends object> = {
// Override post process type
postProcess(process: (Doc: nlp.ExtendedDocument<D, W>) => void): nlp.ExtendedWorld<W>
} & nlp.ExtendedWorld<W>
type PluginDocument<D extends object, W extends object> = nlp.ExtendedDocument<D, W> & { prototype: D }
type PluginTerm = nlp.Term & PluginConstructor
// Make these available, full support tbd
type PluginConstructor = {
prototype: Record<string, any>
}
// Constructor
declare module nlp {
export function tokenize(text: string): DefaultDocument
export function tokenize(text?: string, lexicon?: Lexicon): DefaultDocument
/** mix in a compromise-plugin */

@@ -70,5 +96,7 @@ export function extend<P>(

/** re-generate a Doc object from .json() results */
export function load(json: any): DefaultDocument
export function fromJSON(json: any): DefaultDocument
/** log our decision-making for debugging */
export function verbose(bool: boolean): DefaultDocument
export function verbose(bool?: boolean): DefaultDocument
/** create instance using global world */
export function clone(): nlp<{}, {}>
/** current semver version of the library */

@@ -78,4 +106,8 @@ export const version: number

type Plugin<D extends object, W extends object> = (
Doc: Document<World & W> & D & { prototype: D },
world: World & W
Doc: PluginDocument<D, W>,
world: PluginWorld<D, W>,
nlp: nlp<D, W>,
Phrase: PluginConstructor,
Term: PluginTerm, // @todo Add extend support
Pool: PluginConstructor
) => void

@@ -126,7 +158,11 @@

/** get the first word in each match */
firstTerm(): Document<W>
firstTerms(): Document<W>
/** get the end word in each match */
lastTerm(): Document<W>
lastTerms(): Document<W>
/** return a flat list of all Term objects in match */
termList(): any
termList(): Term[]
/** grab a specific named capture group */
groups(name: string): Document<W>
/** grab all named capture groups */
groups(): DocIndex<W>

@@ -156,2 +192,4 @@ // Match

lookup(matches: string[]): Document<W>
/** quick find for an object of key-value matches */
lookup(matches: Lexicon): DocIndex<W>

@@ -298,3 +336,3 @@ // Case

// Subsets
/** alias for .all(), until plugin overloading */
/** get the whole sentence for each match */
sentences(): Document<W>

@@ -418,5 +456,173 @@ /** return things like `'Mrs.'`*/

class World {}
class World {
/** more logs for debugging */
verbose(on?: boolean): this
isVerbose(): boolean
/** get all terms in our lexicon with this tag */
getByTag(tag: string): Record<string, true>
/** put new words into our lexicon, properly */
addWords(words: Record<string, string>): void
/** extend the compromise tagset */
addTags(
tags: Record<
string,
{
isA?: string | string[]
notA?: string | string[]
}
>
): void
/** call methods after tagger runs */
postProcess<D extends Document = Document>(process: (Doc: D) => void): this
}
// @todo
interface RegSyntax {
[index: string]: any
}
type TextOutOptions =
| 'reduced'
| 'root'
| 'implicit'
| 'normal'
| 'unicode'
| 'titlecase'
| 'lowercase'
| 'acronyms'
| 'whitespace'
| 'punctuation'
| 'abbreviations'
type JsonOutOptions = 'text' | 'normal' | 'tags' | 'clean' | 'id' | 'offset' | 'implicit' | 'whitespace' | 'bestTag'
class Term {
isA: 'Term' // Get Type
id: string
// main data
text: string
tags: Record<string, boolean>
// alternative forms of this.text
root: string | null
implicit: string | null
clean?: string
reduced?: string
// additional surrounding information
prev: string | null // id of prev term
next: string | null // id of next term
pre?: string // character before e.g. ' ' ','
post?: string // character after e.g. ' ' ','
// support alternative matches
alias?: string
constructor(text?: string)
set(text: string): this
/** clone contents to new term */
clone(): Term
/** convert all text to uppercase */
toUpperCase(): this
/** convert all text to lowercase */
toLowerCase(): this
/** only set the first letter to uppercase
* leave any existing uppercase alone
*/
toTitleCase(): this
/** if all letters are uppercase */
isUpperCase(): this
/** if the first letter is uppercase, and the rest are lowercase */
isTitleCase(): this
titleCase(): this
/** search the term's 'post' punctuation */
hasPost(): boolean
/** search the term's 'pre' punctuation */
hasPre(): boolean
/** does it have a quotation symbol? */
hasQuote(): boolean
hasQuotation(): boolean
/** does it have a comma? */
hasComma(): boolean
/** does it end in a period? */
hasPeriod(): boolean
/** does it end in an exclamation */
hasExclamation(): boolean
/** does it end with a question mark? */
hasQuestionMark(): boolean
/** is there a ... at the end? */
hasEllipses(): boolean
/** is there a semicolon after this word? */
hasSemicolon(): boolean
/** is there a slash '/' in this word? */
hasSlash(): boolean
/** a hyphen connects two words like-this */
hasHyphen(): boolean
/** a dash separates words - like that */
hasDash(): boolean
/** is it multiple words combined */
hasContraction(): boolean
/** try to sensibly put this punctuation mark into the term */
addPunctuation(punct: string): this
doesMatch(reg: RegSyntax, index: number, length: number): boolean
/** does this term look like an acronym? */
isAcronym(): boolean
/** is this term implied by a contraction? */
isImplicit(): boolean
/** does the term have at least one good tag? */
isKnown(): boolean
/** cache the root property of the term */
setRoot(world: World): void
/** return various text formats of this term */
textOut(options?: Record<TextOutOptions, boolean>, showPre?: boolean, showPost?: boolean): string
/** return various metadata for this term */
// @todo create output type from options...
json(options?: Record<JsonOutOptions, boolean>, world?: World): object
/** add a tag or tags, and their descendants to this term */
tag(tags: string | string[], reason?: string, world?: World): this
/** only tag this term if it's consistent with its current tags */
tagSafe(tags: string | string[], reason?: string, world?: World): this
/** remove a tag or tags, and their descendants from this term */
unTag(tags: string | string[], reason?: string, world?: World): this
/** is this tag consistent with the word's current tags? */
canBe(tags: string | string[], world?: World): boolean
}
}
export default nlp

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is not supported yet

Socket — SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc