Comparing version 1.10.0 to 1.11.0
@@ -1,1 +0,1 @@ | ||
{"processes":{"bd418e9e-72f2-4293-81ff-b902fc60d267":{"parent":null,"children":[]}},"files":{"/Users/neilsbohr/dev/winkjs/wink-nlp/src/wink-nlp.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/dd-wrapper.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/constants.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/doc-v2.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/contained-entities.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/locate.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/get-parent-item.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/search.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-get-item.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-get-item.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-each.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-each.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-filter.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-filter.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-token-out.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/its.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/sort4FT.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/allowed.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/as.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink
-nlp/src/contained-markings.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-tokens-out.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-tokens-out.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-entity-out.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-entities-out.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-entities-out.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-sentence-out.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-sentences-out.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-document-out.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/print-tokens.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/cache.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/tokenizer.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/recursive-tokenizer.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/compile-trex.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/tokens-mappers.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/examples-compiler.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/automaton.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/compose-patterns.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/helper.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/utilities/bm25-vectorizer.j
s":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/utilities/allowed.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"],"/Users/neilsbohr/dev/winkjs/wink-nlp/utilities/similarity.js":["bd418e9e-72f2-4293-81ff-b902fc60d267"]},"externalIds":{}} | ||
{"processes":{"bb470559-1230-4db7-9f42-e224006fd2ad":{"parent":null,"children":[]}},"files":{"/Users/neilsbohr/dev/winkjs/wink-nlp/src/wink-nlp.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/dd-wrapper.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/constants.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/doc-v2.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/contained-entities.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/locate.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/get-parent-item.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/search.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-get-item.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-get-item.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-each.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-each.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-filter.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-filter.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-token-out.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/its.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/sort4FT.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/allowed.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/as.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink
-nlp/src/contained-markings.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-tokens-out.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-tokens-out.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-entity-out.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-entities-out.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/sel-entities-out.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-sentence-out.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/col-sentences-out.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/itm-document-out.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/api/print-tokens.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/cache.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/tokenizer.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/recursive-tokenizer.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/compile-trex.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/tokens-mappers.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/examples-compiler.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/automaton.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/compose-patterns.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/src/helper.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/utilities/bm25-vectorizer.j
s":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/utilities/allowed.js":["bb470559-1230-4db7-9f42-e224006fd2ad"],"/Users/neilsbohr/dev/winkjs/wink-nlp/utilities/similarity.js":["bb470559-1230-4db7-9f42-e224006fd2ad"]},"externalIds":{}} |
@@ -0,1 +1,9 @@ | ||
# [Enhancing custom entities & BM25Vectorizer](https://github.com/winkjs/wink-nlp/releases/tag/1.11.0) | ||
## Version 1.11.0 January 30, 2022 | ||
### ✨ Features | ||
- Obtain the bag-of-words for a tokenized text from BM25Vectorizer using the `.bowOf()` API — useful for bow-based [similarity](https://winkjs.org/wink-nlp/similarity.html) computation. 👍 | ||
- [`learnCustomEntities()`](https://winkjs.org/wink-nlp/learn-custom-entities.html) displays a console warning if a complex [shorthand pattern](https://winkjs.org/wink-nlp/custom-entities.html) is likely to cause a learning/execution slowdown. 🤞❗️ | ||
# [Enabling loading of BM25Vectorizer model](https://github.com/winkjs/wink-nlp/releases/tag/1.10.0) | ||
@@ -2,0 +10,0 @@ ## Version 1.10.0 November 18, 2021 |
{ | ||
"name": "wink-nlp", | ||
"version": "1.10.0", | ||
"version": "1.11.0", | ||
"description": "Developer friendly Natural Language Processing ✨", | ||
@@ -5,0 +5,0 @@ "keywords": [ |
@@ -113,2 +113,5 @@ // wink-nlp | ||
* | ||
* If a single pattern expands to a large size, it issues a console warning | ||
* above 512 expanded patterns and a console error above 65536. | ||
* | ||
* @param {string} str the input string. | ||
@@ -121,2 +124,4 @@ * @return {string[]} of all possible patterns. | ||
const LIMIT1 = 512; | ||
const LIMIT2 = 65536; | ||
var quotedTextElems = extractEnclosedText( str ); | ||
@@ -131,2 +136,16 @@ var patterns = []; | ||
// Compute the size of the array that will be produced as a result of processing | ||
// the pattern. | ||
const size = patterns.reduce( ( ( prev, curr ) => prev * curr.length ), 1 ); | ||
// Issue warning/error if the size is prohibitively large from the end-user | ||
// perspective. Note: while winkNLP can handle even larger sizes, it can | ||
// still break down in the event of explosion! | ||
if ( size > LIMIT1 && size < LIMIT2 ) { | ||
console.warn( 'winkNLP: complex pattern detected, consider simplifying it!' ); | ||
} else if ( size > LIMIT2 ) console.error( | ||
'winkNLP: very complex pattern detected, please review and simplify.\n' + | ||
' === It may slow down further execution! ===\n\n' | ||
); | ||
product( patterns ).forEach( function ( e ) { | ||
@@ -133,0 +152,0 @@ finalPatterns.push( e.join( ' ' ).trim().split( /\s+/ ) ); |
@@ -307,3 +307,3 @@ // Minimum TypeScript Version: 4.0 | ||
import { Tokens, Document, ItsFunction } from 'wink-nlp'; | ||
import { Tokens, Document, ItsFunction, Bow } from 'wink-nlp'; | ||
@@ -324,2 +324,3 @@ export type Norm = "l1" | "l2" | "none"; | ||
vectorOf(tokens: Tokens): number[]; | ||
bowOf(tokens: Tokens): Bow; | ||
config(): BM25VectorizerConfig; | ||
@@ -326,0 +327,0 @@ loadModel(json: string): void; |
@@ -279,3 +279,3 @@ // wink-nlp | ||
* the tf-idf learned so far. | ||
* @param {string} tokens tokenized document, usually obtained via winkNLP. | ||
* @param {string[]} tokens tokenized document, usually obtained via winkNLP. | ||
* @return {number[]} its vector. | ||
@@ -306,5 +306,43 @@ */ | ||
// `thisNorm || 1` ensures that there is no attempt to divide by zero! | ||
// This may happen if all tokens are unseen. | ||
return arr.map( ( v ) => +( v / ( thisNorm || 1 ) ).toFixed( precision ) ); | ||
}; // vectorOf() | ||
// ## bowOf | ||
/** | ||
* Computes the bag-of-words (bow) of the input tokens: an object that maps | ||
* each token having a truthy `idf` entry to its weight, normalized as per the | ||
* configured `norm` and rounded to `precision` decimal places. Tokens without | ||
* such an entry (i.e. unseen ones) are left out of the result. | ||
* | ||
* @param {string[]} tokens tokenized text, usually obtained via winkNLP. | ||
* @return {object} its bow as token/weight pairs; it is empty if no token | ||
* passes the `idf` check. | ||
*/ | ||
methods.bowOf = function ( tokens ) { | ||
// Refresh the weights before they are read below. | ||
computeWeights(); | ||
// Prototype-less object, so that only the tokens added here show up as keys. | ||
const bow = Object.create( null ); | ||
// Presumably the average length of the documents learned so far — confirm | ||
// against the code that maintains `sumOfAllDLs` and `docId`. | ||
const avgDL = sumOfAllDLs / docId; | ||
let thisNorm = 0; | ||
// Pass 1: count the occurrences of every token that passes the `idf` check. | ||
for ( let i = 0; i < tokens.length; i += 1 ) { | ||
const t = tokens[ i ]; | ||
// bow applies only if the token is not an unseen one! | ||
if ( idf[ t ] ) bow[ t ] = 1 + ( bow[ t ] || 0 ); | ||
} | ||
// Pass 2: replace each count by its BM25-style weight, where `tokens.length` | ||
// relative to `avgDL` acts as the length normalization; also accumulate the | ||
// norm contribution of each weight via the configured `normFn`. | ||
for ( const t in bow ) { // eslint-disable-line guard-for-in | ||
bow[ t ] = idf[ t ] * ( ( k1 + 1 ) * bow[ t ] ) / ( ( k1 * ( 1 - b + ( b * ( tokens.length / avgDL ) ) ) ) + bow[ t ] ); | ||
thisNorm += normFn[ norm ]( bow[ t ] ); | ||
} | ||
// For L2 the accumulated sum is square-rooted; for NONE the divisor is | ||
// forced to 1 so that the weights are only rounded, not scaled. | ||
if ( norm === L2 ) { | ||
thisNorm = Math.sqrt( thisNorm ); | ||
} else if ( norm === NONE ) thisNorm = 1; | ||
// Pass 3: scale by the norm and round; the unary `+` converts the string | ||
// returned by `toFixed()` back into a number. | ||
for ( const t in bow ) { // eslint-disable-line guard-for-in | ||
// Unlike in `vectorOf`, `thisNorm || 1` is not needed here as bow will be | ||
// empty if `thisNorm` is zero! | ||
bow[ t ] = +( bow[ t ] / thisNorm ).toFixed( precision ); | ||
} | ||
return bow; | ||
}; // bowOf() | ||
methods.config = ( () => ( { k: k, k1: k1, b: b, norm: norm } ) ); | ||
@@ -311,0 +349,0 @@ |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
557070
5924