wink-nlp npm package

Comparing versions 1.14.3 and 2.0.0

.github/workflows/coveralls.yml


.nyc_output/processinfo/index.json

@@ -1,1 +0,1 @@

{"processes":{"1b7043cb-6519-437a-bb44-eec3016432dc":{"parent":null,"children":[]}},"files":{"/Users/neilsbohr/dev/winkjs/wink-nlp/src/wink-nlp.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/dd-wrapper.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/constants.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/doc-v2.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/contained-entities.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/locate.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/get-parent-item.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/search.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-get-item.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-get-item.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-each.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-each.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-filter.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-filter.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-token-out.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/its.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/sort4FT.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/sentence-wise-importance.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/allowed.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/as.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/contained-markings.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-tokens-out.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-tokens-out.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-entity-out.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-entities-out.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-entities-out.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-sentence-out.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-sentences-out.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-document-out.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/print-tokens.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/cache.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/tokenizer.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/recursive-tokenizer.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/compile-trex.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/ne
ilsbohr/dev/winkjs/wink-nlp/src/tokens-mappers.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/examples-compiler.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/automaton.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/compose-patterns.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/identify-marked-area.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/helper.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/utilities/bm25-vectorizer.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/utilities/allowed.js":["1b7043cb-6519-437a-bb44-eec3016432dc"],"/Users/neilsbohr/dev/winkjs/wink-nlp/utilities/similarity.js":["1b7043cb-6519-437a-bb44-eec3016432dc"]},"externalIds":{}}
{"processes":{"c672e6ee-7ff5-4bc6-83ca-ae67e007fff0":{"parent":null,"children":[]}},"files":{"/Users/neilsbohr/dev/winkjs/wink-nlp/src/wink-nlp.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/dd-wrapper.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/constants.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/doc-v2.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/contained-entities.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/locate.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/get-parent-item.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/search.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-get-item.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-get-item.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-each.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-each.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-filter.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-filter.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-token-out.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/its.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/sort4FT.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/sentence-wise-importance.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/allowed.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/as.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/contained-markings.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-tokens-out.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-tokens-out.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-entity-out.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-entities-out.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-entities-out.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-sentence-out.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-sentences-out.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-document-out.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/print-tokens.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/cache.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/tokenizer.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/recursive-tokenizer.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/compile-trex.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/ne
ilsbohr/dev/winkjs/wink-nlp/src/tokens-mappers.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/examples-compiler.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/automaton.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/compose-patterns.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/identify-marked-area.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/helper.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/utilities/bm25-vectorizer.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/utilities/allowed.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"],"/Users/neilsbohr/dev/winkjs/wink-nlp/utilities/similarity.js":["c672e6ee-7ff5-4bc6-83ca-ae67e007fff0"]},"externalIds":{}}

CHANGELOG.md

@@ -0,1 +1,14 @@

# [Word embeddings have arrived!](https://github.com/winkjs/wink-nlp/releases/tag/2.0.0)
## Version 2.0.0 March 24, 2024
### ✨ Features
- Seamless word embedding integration enhances winkNLP's semantic capabilities. 🎉 👏 🙌
- Pre-trained 100-dimensional word embeddings for over 350,000 English words released: [wink-embeddings-sg-100d](https://github.com/winkjs/wink-embeddings-sg-100d). 💯
- The existing API remains unchanged, so no code updates are needed for existing projects. The new APIs include (see the sketch after this list): 🤩
- **Obtain vector for a token:** Use the `.vectorOf( token )` API.
  - **Compute sentence/document embeddings:** Employ the `as.vector` helper: use `.out( its.lemma, as.vector )` on the tokens of a sentence or document. You can also use `its.value` or `its.normal`. Tokens can be pre-processed, e.g. to remove stop words, using the `.filter()` API. Note that the `as.vector` helper uses an averaging technique.
  - **Generate contextual vectors:** Leverage the `.contextualVectors()` method on a document. Useful for pure browser-side applications! Generate custom vectors contextually relevant to your corpus and use them in place of the larger pre-trained wink embeddings.
- Comprehensive documentation along with interesting examples is coming up shortly. Stay tuned for updates! 😎
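A minimal sketch of the new APIs in action. It assumes the `wink-eng-lite-web-model` and the new `wink-embeddings-sg-100d` packages are installed; the embeddings are passed as the third argument to winkNLP:

const winkNLP = require( 'wink-nlp' );
const model = require( 'wink-eng-lite-web-model' );
const embeddings = require( 'wink-embeddings-sg-100d' );
// Load the model with word embeddings; the default pipe is used when
// `pipe` is null.
const nlp = winkNLP( model, null, embeddings );
const its = nlp.its;
const as = nlp.as;

const doc = nlp.readDoc( 'Word embeddings have arrived!' );
// Vector of a single token: 100 dimensions plus a trailing l2Norm.
const tokenVector = nlp.vectorOf( 'arrived' );
// Averaged sentence embedding computed over token lemmas.
const sentenceVector = doc.sentences()
  .itemAt( 0 )
  .tokens()
  .out( its.lemma, as.vector );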
# [Added Deno example](https://github.com/winkjs/wink-nlp/releases/tag/1.14.3)

@@ -2,0 +15,0 @@ ## Version 1.14.3 July 21, 2023

package.json

{
"name": "wink-nlp",
"version": "1.14.3",
"version": "2.0.0",
"description": "Developer friendly Natural Language Processing ✨",

@@ -23,2 +23,4 @@ "keywords": [

"vectorizer",
"Embeddings",
"Word Vectors",
"winkNLP",

@@ -31,3 +33,3 @@ "winkjs",

"pretest": "npm run lint",
"test": "nyc --reporter=html --reporter=text mocha ./test/",
"test": "nyc --reporter=html --reporter=lcov --reporter=text mocha ./test/",
"coverage": "nyc report --reporter=text-lcov | coveralls",

@@ -34,0 +36,0 @@ "sourcedocs": "docker -i src -o ./sourcedocs --sidebar yes",

README.md

# winkNLP
### [![Build Status](https://travis-ci.com/winkjs/wink-nlp.svg?branch=master)](https://travis-ci.com/github/winkjs/wink-nlp) [![Coverage Status](https://coveralls.io/repos/github/winkjs/wink-nlp/badge.svg?branch=master)](https://coveralls.io/github/winkjs/wink-nlp?branch=master) [![Known Vulnerabilities](https://snyk.io/test/github/winkjs/wink-nlp/badge.svg)](https://snyk.io/test/github/winkjs/wink-nlp) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/6035/badge)](https://bestpractices.coreinfrastructure.org/projects/6035) [![Gitter](https://img.shields.io/gitter/room/nwjs/nw.js.svg)](https://gitter.im/winkjs/Lobby) [![Follow on Twitter](https://img.shields.io/twitter/follow/winkjs_org?style=social)](https://twitter.com/winkjs_org)
### [![Build Status](https://github.com/winkjs/wink-nlp/actions/workflows/node.js.yml/badge.svg)](https://github.com/winkjs/wink-nlp/actions/workflows/node.js.yml/) [![Coverage Status](https://coveralls.io/repos/github/winkjs/wink-nlp/badge.svg?branch=master)](https://coveralls.io/github/winkjs/wink-nlp?branch=master) [![Known Vulnerabilities](https://snyk.io/test/github/winkjs/wink-nlp/badge.svg)](https://snyk.io/test/github/winkjs/wink-nlp) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/6035/badge)](https://bestpractices.coreinfrastructure.org/projects/6035) [![Gitter](https://img.shields.io/gitter/room/nwjs/nw.js.svg)](https://gitter.im/winkjs/Lobby) [![Follow on Twitter](https://img.shields.io/twitter/follow/winkjs_org?style=social)](https://twitter.com/winkjs_org)

@@ -5,0 +5,0 @@ ## Developer friendly Natural Language Processing ✨

@@ -68,3 +68,4 @@ // wink-nlp

as.unique,
as.markedUpText
as.markedUpText,
as.vector
] );

@@ -80,3 +81,4 @@

as.bigrams,
as.unique
as.unique,
as.vector
] );

@@ -83,0 +85,0 @@

src/api/col-tokens-out.js

@@ -56,7 +56,2 @@ // wink-nlp

var colTokensOut = function ( start, end, rdd, itsf, asf, addons ) {
// Vectors require completely different handling.
if ( itsf === its.vector ) {
return its.vector( start, end, rdd.tokens, addons );
}
// Not a vector request, perform map-reduce.

@@ -66,2 +61,7 @@ var mappedTkns = [];

var asfn = ( asf && allowed.as4tokens.has( asf ) ) ? asf : as.array;
if ( itsfn !== its.value && itsfn !== its.normal && itsfn !== its.lemma && asfn === as.vector ) {
throw Error( 'winkNLP: as.vector is allowed only with its value or normal or lemma.' );
}
// Note: `as.text/markedUpText` needs special attention to include preceding spaces.

@@ -78,5 +78,5 @@ if ( asfn === as.text || asfn === as.markedUpText ) {

return asfn( mappedTkns, rdd.markings, start, end );
return asfn( mappedTkns, rdd, start, end );
}; // colTokensOut()
module.exports = colTokensOut;
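With the guard above, requesting `as.vector` with an incompatible `its` helper now throws instead of silently mis-mapping. A small illustration, reusing `nlp`, `its` and `as` from the sketch earlier:

const doc = nlp.readDoc( 'Embeddings everywhere.' );
// Throws: 'winkNLP: as.vector is allowed only with its value or normal or lemma.'
doc.tokens().out( its.pos, as.vector );
// Fine: its.value, its.normal and its.lemma are the permitted mappers.
const dv = doc.tokens().out( its.normal, as.vector );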

src/api/itm-document-out.js

@@ -50,6 +50,2 @@ // wink-nlp

var document = rdd.document;
// Vectors require completely different handling.
if ( itsf === its.vector ) {
return its.vector( document, rdd, addons );
}

@@ -56,0 +52,0 @@ var itsfn = ( itsf && allowed.its4document.has( itsf ) ) ? itsf : its.value;

src/api/itm-sentence-out.js

@@ -51,6 +51,2 @@ // wink-nlp

var sentence = rdd.sentences[ index ];
// Vectors require completely different handling.
if ( itsf === its.vector ) {
return its.vector( sentence, rdd, addons );
}

@@ -57,0 +53,0 @@ var itsfn = ( itsf && allowed.its4sentence.has( itsf ) ) ? itsf : its.value;

src/api/itm-token-out.js

@@ -48,6 +48,2 @@ // wink-nlp

var itmTokenOut = function ( index, rdd, itsf, addons ) {
// Vectors require completely different handling.
if ( itsf === its.vector ) {
return its.vector( index, rdd, addons );
}
// Not a vector request, map using `itsf`.

@@ -54,0 +50,0 @@ var f = ( allowed.its4token.has( itsf ) ) ? itsf : its.value;

src/api/sel-tokens-out.js

@@ -55,7 +55,2 @@ // wink-nlp

var selTokensOut = function ( selTokens, rdd, itsf, asf, addons ) {
// Vectors require completely different handling.
if ( itsf === its.vector ) {
return its.vector( selTokens, rdd.tokens, addons );
}
// Not a vector request, perform map-reduce.

@@ -66,2 +61,6 @@ var mappedTkns = [];

if ( itsfn !== its.value && itsfn !== its.normal && itsfn !== its.lemma && asfn === as.vector ) {
throw Error( 'winkNLP: as.vector is allowed only with its value or normal or lemma.' );
}
// Note: `as.text` needs special attention to include preceding spaces.

@@ -79,5 +78,5 @@ // No `markedUpText` allowed here.

return asfn( mappedTkns );
return asfn( mappedTkns, rdd );
}; // selTokensOut()
module.exports = selTokensOut;

src/as.js

@@ -155,3 +155,3 @@ // wink-nlp

* @param {array} twps Array containing tokens with preceding spaces.
* @param {array} markings Array containing span of markings & marking specs.
* @param {object} rdd Raw Document Data structure.
* @param {number} start The start index of the tokens.

@@ -162,3 +162,5 @@ * @param {number} end The end index of the tokens.

*/
as.markedUpText = function ( twps, markings, start, end ) {
as.markedUpText = function ( twps, rdd, start, end ) {
// Extract markings.
const markings = rdd.markings;
// Offset to be added while computing `first` and `last` indexes of `twps`.

@@ -188,2 +190,52 @@ var offset = start * 2;

as.vector = function ( tokens, rdd ) {
if ( !rdd.wordVectors )
throw Error( 'wink-nlp: word vectors are not loaded, use const nlp = winkNLP( model, pipe, wordVectors ) to load.' );
// Get size of a vector from word vectors
const size = rdd.wordVectors.dimensions;
const precision = rdd.wordVectors.precision;
const vectors = rdd.wordVectors.vectors;
const l2NormIndex = rdd.wordVectors.l2NormIndex;
// Set up a new initialized vector of `size`
const v = new Array( size );
v.fill( 0 );
// Compute average.
// We will count the number of tokens as some of them may not have a vector.
let numOfTokens = 0;
for ( let i = 0; i < tokens.length; i += 1 ) {
// Extract token vector for the current token.
const tv = vectors[ tokens[ i ].toLowerCase() ];
// Increment `numOfTokens` if the above lookup succeeded AND the
// l2Norm is non-zero, because for UNK vectors it is set to 0. The
// latter applies to contextual vectors, where an all-zero vector is
// assigned to an UNK word.
if ( tv !== undefined && tv[ l2NormIndex ] !== 0 ) numOfTokens += 1;
for ( let j = 0; j < size; j += 1 ) {
// Keep summing; the sum is eventually divided by `numOfTokens` to obtain the average.
v[ j ] += ( tv === undefined ) ? 0 : tv[ j ];
}
}
// If no token's vector is found, return a 0-vector!
if ( numOfTokens === 0 ) {
// Push l2Norm, which is 0 in this case.
v.push( 0 );
return v;
}
// Non-zero vector: find the average by dividing the sum by `numOfTokens`;
// also compute the l2Norm.
let l2Norm = 0;
for ( let i = 0; i < size; i += 1 ) {
v[ i ] = +( v[ i ] / numOfTokens ).toFixed( precision );
l2Norm += v[ i ] * v[ i ];
}
// `l2Norm` becomes the `size+1th` element for faster cosine similarity/normalization.
v.push( +( Math.sqrt( l2Norm ).toFixed( precision ) ) );
return v;
}; // vector()
module.exports = as;
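To make the averaging concrete, here is a toy walk-through with invented 3-dimensional vectors, i.e. `dimensions = 3`, `precision = 2` and `l2NormIndex = 3`:

// vectors[ 'new' ]  = [ 0.20, 0.40, 0.60, 0.75 ]   // last element is the l2Norm
// vectors[ 'york' ] = [ 0.40, 0.00, 0.20, 0.45 ]
// as.vector( [ 'new', 'york' ], rdd ) averages dimension-wise:
//   v = [ 0.30, 0.20, 0.40 ]
// and appends l2Norm = sqrt( 0.09 + 0.04 + 0.16 ) = sqrt( 0.29 ) ≈ 0.54,
// returning [ 0.30, 0.20, 0.40, 0.54 ].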

src/doc-v2.js

@@ -73,2 +73,4 @@ // wink-nlp

var its = require( './its.js' );
// <hr/>

@@ -133,2 +135,5 @@

// Vectors API
var contextualVectors;
// Others.

@@ -449,3 +454,116 @@ var isLexeme = cache.lookup;

// ### contextualVectors
/**
*
* Makes a JSON of contextually relevant words in the winkNLP format.
*
* @return {string} containing the JSON.
*/
// eslint-disable-next-line complexity
contextualVectors = function ( { lemma = true, specificWordVectors = [], similarWordVectors = false, wordVectorsLimit = 0 } = {} ) {
// Error handling!
if ( !Array.isArray( specificWordVectors ) )
throw Error( `wink-nlp: expecting a valid JavaScript array for specificWordVectors, instead found "${typeof specificWordVectors}".` );
if ( !Number.isInteger( wordVectorsLimit ) || wordVectorsLimit >= docData.wordVectors.size )
throw Error( 'wink-nlp: invalid value or type encountered for wordVectorsLimit.' );
// Initialize contextual vectors.
const cv = Object.create( null );
// Following properties are constants, therefore can be directly copied.
cv.precision = docData.wordVectors.precision;
cv.l2NormIndex = docData.wordVectors.l2NormIndex;
cv.wordIndex = docData.wordVectors.wordIndex;
cv.dimensions = docData.wordVectors.dimensions;
cv.unkVector = docData.wordVectors.unkVector.slice( 0 );
// Following properties will be determined on the basis of the context.
cv.size = 0;
cv.words = [];
cv.vectors = Object.create( null );
// Shortcut all word vectors.
const awvs = docData.wordVectors.vectors;
// Extract all document's tokens.
const docTokens = colTokens( 0, docData.numOfTokens - 1 )()
.out()
.map( ( t ) => t.toLowerCase() );
let docTokensLemma = [];
if ( lemma ) docTokensLemma = colTokens( 0, docData.numOfTokens - 1 )()
.out( its.lemma )
.map( ( t ) => t.toLowerCase() );
// NOTE: For UNK words, an all-zero vector with `l2Norm = 0` is set up; the
// `as.vector` helper may use this to detect an UNK word.
for ( let i = 0; i < docTokens.length; i += 1 ) cv.vectors[ docTokens[ i ] ] = ( awvs[ docTokens[ i ] ] || cv.unkVector ).slice( 0 );
for ( let i = 0; i < docTokensLemma.length; i += 1 ) cv.vectors[ docTokensLemma[ i ] ] = ( awvs[ docTokensLemma[ i ] ] || cv.unkVector ).slice( 0 );
for ( let i = 0; i < specificWordVectors.length; i += 1 ) {
const spWord = ( specificWordVectors[ i ] ) ? specificWordVectors[ i ].toString().trim() : false;
if ( spWord )
cv.vectors[ specificWordVectors[ i ] ] = ( awvs[ specificWordVectors[ i ] ] || cv.unkVector ).slice( 0 );
}
if ( similarWordVectors ) {
// Extract similar words on the basis of shortest Manhattan distance.
const allUniqueTokens = Object.keys( cv.vectors );
// Set up similar words array, with the size of all unique tokens.
const similarWords = new Array( allUniqueTokens.length );
// Placeholder for maintaining the similarity score based on Manhattan distance.
const similarWordsScore = new Array( allUniqueTokens.length );
// Initialize to a large distance!
similarWordsScore.fill( 1000000 );
// Initialize contextual vectors size i.e. vocab.
cv.size = allUniqueTokens.length;
// Now search each one of them in the entire word vectors space.
// Keep updating the smallest distance.
for ( let i = 0; i < allUniqueTokens.length; i += 1 ) {
const cwv = cv.vectors[ allUniqueTokens[ i ] ];
for ( const word in awvs ) { // eslint-disable-line guard-for-in
if ( word === allUniqueTokens[ i ] ) continue; // eslint-disable-line no-continue
const wv = awvs[ word ];
let distance = 0;
for ( let k = 0; k < cv.dimensions && distance < similarWordsScore[ i ]; k += 1 ) {
distance += Math.abs( cwv[ k ] - wv[ k ] );
} // Manhattan distance computation loop.
if ( distance < similarWordsScore[ i ] ) {
similarWordsScore[ i ] = distance;
similarWords[ i ] = word;
}
} // Traversing all the word vectors.
} // Traversing all the tokens in the corpus.
// Update contextual vectors using the list of similar words; also update their size.
for ( let i = 0; i < similarWords.length; i += 1 ) {
if ( cv.vectors[ similarWords[ i ] ] === undefined ) {
// Similar word must exist in `awvs`.
cv.vectors[ similarWords[ i ] ] = awvs[ similarWords[ i ] ].slice( 0 );
cv.size += 1;
}
}
} else cv.size = Object.keys( cv.vectors ).length;
// Fill the balance space, if any, on the basis of wordVectorsLimit.
for ( let i = 0; cv.size < wordVectorsLimit; i += 1 ) {
const word = docData.wordVectors.words[ i ];
if ( !cv.vectors[ word ] ) {
cv.vectors[ word ] = awvs[ word ].slice( 0 );
cv.size += 1;
}
}
// Sort words on the basis of their usage frequency.
cv.words = Object.keys( cv.vectors )
.map( ( w ) => ( { w: w, i: (cv.vectors[ w ][ cv.wordIndex ] < 0 ) ? Infinity : cv.vectors[ w ][ cv.wordIndex ] } ) )
.sort( (a, b) => a.i - b.i )
.map( ( o ) => o.w );
// Update the word index entry inside every vector.
for ( let i = 0; i < cv.size; i += 1 ) cv.vectors[ cv.words[ i ] ][ cv.wordIndex ] = i;
return JSON.stringify( cv );
}; // contextualVectors()
// Published chainable methods.
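A hypothetical usage sketch for generating and saving contextual vectors; `nlp`, `corpusText` and the output file name are illustrative:

const fs = require( 'fs' );
const doc = nlp.readDoc( corpusText );
// Produce a compact, corpus-specific vectors JSON; similar words are
// pulled in via the shortest Manhattan distance, as implemented above.
const json = doc.contextualVectors( {
  lemma: true,
  similarWordVectors: true,
  wordVectorsLimit: 1000
} );
fs.writeFileSync( 'custom-vectors.json', json );
// The saved JSON can later be loaded and passed to winkNLP in place of
// the full pre-trained embeddings.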

@@ -465,2 +583,4 @@ methods.entities = colEntities;

methods.contextualVectors = contextualVectors;
return methods;

@@ -467,0 +587,0 @@ };

src/wink-nlp.js

@@ -64,2 +64,3 @@ // wink-nlp

* @param {string[]} pipe of nlp annotations.
* @param {object} wordEmbeddings word embeddings object loaded via Node's `require`.
* @returns {object} containing a set of API methods for natural language processing.

@@ -70,3 +71,3 @@ * @example

*/
var nlp = function ( theModel, pipe ) {
var nlp = function ( theModel, pipe = null, wordEmbeddings = null ) {

@@ -216,2 +217,4 @@ var methods = Object.create( null );

rdd.cache = cache;
// Each document gets a pointer to the word vectors.
rdd.wordVectors = wordEmbeddings;
// Document's tokens; each token is represented as an array of numbers:

@@ -416,3 +419,26 @@ // ```

const tempPipe = ( pipe === undefined ) ? Object.keys( validAnnotations ) : pipe;
if ( wordEmbeddings !== null ) {
if ( !helper.isObject( wordEmbeddings ) )
throw Error( `wink-nlp: invalid word vectors, it must be an object instead found a "${typeof wordEmbeddings}".` );
let numOfKeys = 0;
const wordVectorKeys = Object.create( null );
wordVectorKeys.precision = true;
wordVectorKeys.l2NormIndex = true;
wordVectorKeys.wordIndex = true;
wordVectorKeys.dimensions = true;
wordVectorKeys.unkVector = true;
wordVectorKeys.size = true;
wordVectorKeys.words = true;
wordVectorKeys.vectors = true;
for ( const key in wordEmbeddings ) { // eslint-disable-line guard-for-in
numOfKeys += 1;
if ( !wordVectorKeys[ key ] )
throw Error( 'wink-nlp: invalid word vectors format.' );
}
if ( numOfKeys === 0 ) throw Error( 'wink-nlp: empty word vectors found.' );
}
const tempPipe = ( pipe === null || pipe === undefined ) ? Object.keys( validAnnotations ) : pipe;
if ( helper.isArray( tempPipe ) ) {
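For reference, the validation loop above permits only the eight keys listed and rejects an empty object; a hypothetical minimal shape (all values illustrative):

const wordEmbeddings = {
  precision: 2,       // decimal places stored per vector element
  l2NormIndex: 100,   // index of the precomputed l2Norm within each vector
  wordIndex: 101,     // index of the word's rank entry within each vector
  dimensions: 100,    // embedding dimensionality
  unkVector: [ /* dimensions + 2 numbers for unknown words */ ],
  size: 2,            // vocabulary size
  words: [ 'the', 'of' ],
  vectors: { 'the': [ /* ... */ ], 'of': [ /* ... */ ] }
};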

@@ -436,3 +462,23 @@ tempPipe.forEach( ( at ) => {

methods.as = asHelpers;
// Vector of a token method.
methods.vectorOf = function ( word, safe = true ) {
if ( !wordEmbeddings )
throw Error( 'wink-nlp: word vectors are not loaded, use const nlp = winkNLP( model, pipe, wordVectors ) to load.' );
const vectors = wordEmbeddings.vectors;
const unkVector = wordEmbeddings.unkVector;
const sliceUpTo = wordEmbeddings.l2NormIndex + 1;
if ( typeof word !== 'string' ) {
throw Error( 'winkNLP: input word must be of type string.' );
}
const tv = vectors[ word.toLowerCase() ];
if ( tv === undefined ) {
// If unsafe, return the entire array.
return ( safe ) ? unkVector.slice( 0, sliceUpTo ) : unkVector.slice();
}
return ( safe ) ? tv.slice( 0, sliceUpTo ) : tv.slice();
}; // vectorOf()
return methods;
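A quick illustration of the `safe` flag; array lengths assume the 100-dimensional embeddings, i.e. an `l2NormIndex` of 100, and the words are hypothetical:

// Safe (default): dimensions plus the trailing l2Norm, i.e. 101 numbers.
const v = nlp.vectorOf( 'cat' );
// Unsafe: the full internal array, including the trailing rank entry.
const raw = nlp.vectorOf( 'cat', false );
// Out-of-vocabulary words return the UNK vector instead of throwing.
const u = nlp.vectorOf( 'xqzzy' );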

@@ -439,0 +485,0 @@ }; // wink
