@telefonica/language-model-converter - npm package version comparison

Comparing version 2.5.1 to 2.6.0

src/chai-eventemitter.d.ts

lib/index.js

@@ -27,2 +27,9 @@ "use strict";
 try {
+    parser.on('warning', (msg) => {
+        console.error(`WARNING: ${msg}`);
+    });
+    parser.on('error', (msg) => {
+        console.error(`ERROR: ${msg}`);
+        process.exit(1);
+    });
     let luisModel = parser.parse(cli.files, cli.culture);

@@ -29,0 +36,0 @@ console.log(JSON.stringify(luisModel, null, 2));
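The CLI now listens for the parser's new 'warning' and 'error' events instead of catching thrown exceptions. A minimal sketch of the same wiring from a library consumer's point of view (the deep import path and YAML file name are illustrative assumptions, not taken from this diff):

import { LanguageModelParser, culture } from '@telefonica/language-model-converter/lib/parser';

const parser = new LanguageModelParser();
// Non-fatal findings (unused lists, duplicated YAML values) arrive as 'warning'.
parser.on('warning', (msg: string) => console.error(`WARNING: ${msg}`));
// Fatal findings arrive as 'error'; note that a Node EventEmitter throws if
// 'error' is emitted with no listener attached, so consumers should subscribe.
parser.on('error', (msg: string) => {
    console.error(`ERROR: ${msg}`);
    process.exit(1);
});
const model = parser.parse(['./models/example.yaml'], 'en-us' as culture);
console.log(JSON.stringify(model, null, 2));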

lib/parser.d.ts

@@ -1,14 +1,21 @@
+/// <reference types="node" />
+import { EventEmitter } from 'events';
 import { Luis } from './luis-model';
 export declare type culture = 'en-us' | 'es-es';
-export declare class LanguageModelParser {
+export declare class LanguageModelParser extends EventEmitter {
     private doc;
     culture: culture;
     parse(files: string[], culture: culture): Luis.Model;
-    private expandVariables(sentence, variables);
+    private searchMissedVariables(sentence, variables, missedVariables);
+    private expandVariables(sentence, variables, usedVariables);
     private extractEntities(sentence);
     private registerEntity(entity, entitiesMap);
     private normalizeSentence(sentence);
-    private wordCount(sentence);
-    private tokenize(sentence);
+    private static wordCount(sentence);
+    private static splitSentenceByTokens(sentence);
+    private static tokenize(sentence);
     private buildUtterance(sentence, intent);
+    private warnAboutDuplicates(obj, prefix?);
+    private emitWarning(msg);
+    private emitError(msg);
 }

lib/parser.js

@@ -5,4 +5,7 @@ "use strict";
 const _ = require("lodash");
-class LanguageModelParser {
+const events_1 = require("events");
+const MIN_EXAMPLES_PER_INTENT = 3;
+class LanguageModelParser extends events_1.EventEmitter {
     constructor() {
+        super(...arguments);
         this.doc = {};

@@ -13,7 +16,9 @@ }
 try {
-    let yamlFileContents = fs.readFileSync(file, 'utf8');
-    mergeDeep(this.doc, yaml.safeLoad(yamlFileContents));
+    let yamlFileContents = yaml.safeLoad(fs.readFileSync(file, 'utf8'));
+    this.warnAboutDuplicates(yamlFileContents);
+    mergeDeep(this.doc, yamlFileContents);
 }
 catch (err) {
-    throw new Error('File "' + file + '": ' + err.message);
+    let e = `File "${file}": ${err.message}`;
+    this.emitError(e);
 }

@@ -43,3 +48,4 @@ });
 if (intentName.length > 50) {
-    throw new Error(`Intent "${intentName}" should be less than 50 characters. was ${intentName.length}`);
+    let err = `Intent "${intentName}" should be less than 50 characters. was ${intentName.length}`;
+    this.emitError(err);
 }

@@ -49,12 +55,15 @@ return intentName;
 let replacements = new Map();
-keys.filter(intentName => intentName.startsWith('list.'))
-    .forEach(intentName => {
-        replacements.set(intentName.slice('list.${'.length, -1), this.doc[intentName]);
+keys.filter(listKey => listKey.startsWith('list.'))
+    .forEach(listKey => {
+        replacements.set(listKey.slice('list.${'.length, -1), this.doc[listKey]);
     });
+let usedReplacements = new Set();
+let missedReplacements = new Set();
 let entitiesMap = new Map();
-let utterances = new Set();
+let utterancesMap = new Map();
 intentNames.forEach(intent => {
     let sentences = this.doc[intent];
     sentences
-        .map((sentence) => this.expandVariables(sentence, replacements))
+        .map((sentence) => this.searchMissedVariables(sentence, replacements, missedReplacements))
+        .map((sentence) => this.expandVariables(sentence, replacements, usedReplacements))
         .reduce((a, b) => a.concat(b))

@@ -64,5 +73,30 @@ .forEach((sentence) => {
     utterance.entities.forEach(entity => this.registerEntity(entity, entitiesMap));
-    utterances.add(utterance);
+    if (utterancesMap.has(utterance.text)) {
+        let err = `Utterance "${utterance.text}" is assigned to ` +
+            `both "${utterancesMap.get(utterance.text).intent}" and "${utterance.intent}" intents`;
+        this.emitError(err);
+    }
+    utterancesMap.set(utterance.text, utterance);
     });
 });
+let utterances = Array.from(utterancesMap.values());
+let examplesPerIntent = _.countBy(utterances, 'intent');
+let intentsWithTooFewExamples = _.pickBy(examplesPerIntent, (counter) => counter < MIN_EXAMPLES_PER_INTENT);
+if (!_.isEmpty(intentsWithTooFewExamples)) {
+    let err = `The following intents have less than ${MIN_EXAMPLES_PER_INTENT} examples:\n` +
+        _.keys(intentsWithTooFewExamples).map(intent => ` - ${intent}`).join('\n');
+    this.emitError(err);
+}
+if (usedReplacements.size < replacements.size) {
+    replacements.forEach((values, key) => {
+        if (!usedReplacements.has(key)) {
+            this.emitWarning(`The list "list.$\{${key}\}" has not been used in any sentence.`);
+        }
+    });
+}
+if (missedReplacements.size > 0) {
+    missedReplacements.forEach(value => {
+        this.emitWarning(`The list "list.$\{${value}\}" is being used from some sentences but it has not been declared.`);
+    });
+}
 let features = _.toPairs(this.doc.phraselist)

@@ -77,5 +111,6 @@ .map(value => {
 if (strword.indexOf(',') !== -1) {
-    throw new Error(`Prashe list "${name}" can not contain commas ('${strword}')`);
+    let err = `Phrase list "${name}" can not contain commas ('${strword}')`;
+    this.emitError(err);
 }
-return this.tokenize(strword).join(' ');
+return LanguageModelParser.tokenize(strword).join(' ');
 })

@@ -91,3 +126,3 @@ .join(',');
 let bingEntities = this.doc.builtin || [];
-luisModel.utterances = Array.from(utterances.values());
+luisModel.utterances = utterances;
 luisModel.entities = Array.from(entitiesMap.values());

@@ -99,3 +134,14 @@ luisModel.intents = intentNames.map(intent => ({ name: intent }));
 }
-expandVariables(sentence, variables) {
+searchMissedVariables(sentence, variables, missedVariables) {
+    let match = sentence.match(/\${(.+?)}/);
+    if (match) {
+        for (let i = 1; i < match.length; i++) {
+            if (!variables.has(match[i])) {
+                missedVariables.add(match[i]);
+            }
+        }
+    }
+    return sentence;
+}
+expandVariables(sentence, variables, usedVariables) {
     let expandedSentences = new Set([sentence]);

@@ -107,2 +153,3 @@ expandedSentences.forEach(sentence => {
 if (sentence.indexOf(search) !== -1) {
+    usedVariables.add(key);
     let newSentence = sentence.replace(search, value);

@@ -167,19 +214,46 @@ if (newSentence !== sentence) {
 }
-wordCount(sentence) {
-    return this.tokenize(sentence).length;
+static wordCount(sentence) {
+    return LanguageModelParser.tokenize(sentence).length;
 }
-tokenize(sentence) {
-    if (sentence === '') {
+static splitSentenceByTokens(sentence) {
+    if (!sentence || sentence.trim().length === 0) {
         return [];
     }
-    let tokenized = String(sentence)
-        .replace(/[^\w\u00C0-\u017F]/g, capture => ` ${capture} `)
-        .replace(/_/g, capture => ` ${capture} `)
-        .replace(' º ', 'º')
-        .replace(' ª ', 'ª')
-        .replace(/\s\s+/g, ' ')
-        .trim()
-        .split(' ');
-    return tokenized;
+    sentence = sentence.replace(/[\s\uFEFF\xA0]+$/g, '');
+    const WORD_CHARS = '0-9A-Za-z' +
+        'ªº' +
+        '\u00B5' +
+        '\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02AF' +
+        '\u02B0-\u02C1' +
+        '\u0370-\u0374\u0376-\u0377\u037A-\u037D\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03FF' +
+        '\u0400-\u0481\u048A-\u0523';
+    const WORD = new RegExp(`^[${WORD_CHARS}]+`);
+    const NON_WORD = new RegExp(`^[^\s${WORD_CHARS}]`);
+    let tokens = [];
+    let sentenceIndex = 0;
+    while (sentence.length) {
+        let leadingSpaces = sentence.match(/^\s*/)[0].length;
+        sentenceIndex += leadingSpaces;
+        sentence = sentence.slice(leadingSpaces);
+        let tokenRegExpRes = sentence.match(WORD);
+        if (!tokenRegExpRes) {
+            tokenRegExpRes = sentence.match(NON_WORD);
+        }
+        if (!tokenRegExpRes) {
+            throw new Error(`The sentence ${sentence} cannot be classified as word or non-word`);
+        }
+        let token = tokenRegExpRes[0];
+        tokens.push({
+            token: token,
+            startChar: sentenceIndex,
+            endChar: sentenceIndex + token.length - 1
+        });
+        sentenceIndex += token.length;
+        sentence = sentence.slice(token.length);
+    }
+    return tokens;
+}
+static tokenize(sentence) {
+    return LanguageModelParser.splitSentenceByTokens(sentence).map(token => token.token);
 }
 buildUtterance(sentence, intent) {

@@ -195,5 +269,5 @@ let entities = [];
 extractedEntities.forEach(entity => {
-    let startPos = this.wordCount(parts);
+    let startPos = LanguageModelParser.wordCount(parts);
     parts += entity.entityValue;
-    let endPos = this.wordCount(parts) - 1;
+    let endPos = LanguageModelParser.wordCount(parts) - 1;
     entities.push({

@@ -213,6 +287,27 @@ entity: entity.entityType,
     intent,
-    entities: entities
+    entities
 };
 return utterance;
 }
+warnAboutDuplicates(obj, prefix = []) {
+    const duplicates = (array) => _.uniq(_.filter(array, (v, i, col) => _.includes(col, v, i + 1)));
+    _.forEach(obj, (value, key) => {
+        if (value && Array.isArray(value)) {
+            let duplicatedValues = duplicates(value);
+            if (duplicatedValues.length) {
+                let breadcrumb = prefix.concat(key).join(' -> ');
+                this.emitWarning(`The key "${breadcrumb}" has the following duplicated values: ${duplicatedValues.join(', ')}`);
+            }
+        }
+        else if (isObject(value)) {
+            this.warnAboutDuplicates(value, prefix.concat(key));
+        }
+    });
+}
+emitWarning(msg) {
+    this.emit('warning', msg);
+}
+emitError(msg) {
+    this.emit('error', msg);
+}
 }

@@ -219,0 +314,0 @@ exports.LanguageModelParser = LanguageModelParser;

package.json

 {
     "name": "@telefonica/language-model-converter",
-    "version": "2.5.1",
+    "version": "2.6.0",
     "description": "Language model converter yaml <-> json for LUIS",

@@ -40,2 +40,3 @@ "license": "Apache-2.0",
     "chai": "^3.5.0",
+    "chai-eventemitter": "^1.1.1",
     "mocha": "^3.0.2",

@@ -42,0 +43,0 @@ "nock": "^8.0.0",
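chai-eventemitter joins the devDependencies, presumably to assert on the parser's new events in the test suite. A hedged sketch of the kind of test this enables, written with a plain listener rather than the chai-eventemitter matchers (the fixture path is hypothetical):

import { LanguageModelParser } from '../src/parser';

describe('LanguageModelParser', () => {
    it("emits 'error' when an intent has fewer than 3 examples", (done) => {
        const parser = new LanguageModelParser();
        parser.on('error', (msg: string) => {
            // Message text taken from the diff above
            if (msg.indexOf('have less than 3 examples') !== -1) {
                done();
            }
        });
        parser.parse(['./test/fixtures/sparse-intent.yaml'], 'en-us');
    });
});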

src/index.ts

@@ -56,2 +56,10 @@ /**
 try {
+    parser.on('warning', (msg: string) => {
+        console.error(`WARNING: ${msg}`);
+    });
+    parser.on('error', (msg: string) => {
+        console.error(`ERROR: ${msg}`);
+        process.exit(1);
+    });
     let luisModel = parser.parse(cli.files, cli.culture as culture);

@@ -58,0 +66,0 @@ console.log(JSON.stringify(luisModel, null, 2));

src/parser.ts

@@ -22,8 +22,17 @@ /**
 import * as _ from 'lodash';
+import { EventEmitter } from 'events';
 import { Luis } from './luis-model';
+interface Token {
+    token: string;
+    startChar: number;
+    endChar: number;
+}
+const MIN_EXAMPLES_PER_INTENT = 3;
 export type culture = 'en-us' | 'es-es';
-export class LanguageModelParser {
+export class LanguageModelParser extends EventEmitter {
     private doc: any = {};

@@ -36,6 +45,12 @@ public culture: culture;
 try {
-    let yamlFileContents = fs.readFileSync(file, 'utf8');
-    mergeDeep(this.doc, yaml.safeLoad(yamlFileContents));
+    let yamlFileContents = yaml.safeLoad(fs.readFileSync(file, 'utf8'));
+    // Look for repeated elements in lists for each file and warn about it.
+    // Note that when merging, new duplicates could appear but those will be
+    // silently removed. The goal is to only warn at a file level.
+    this.warnAboutDuplicates(yamlFileContents);
+    mergeDeep(this.doc, yamlFileContents);
 } catch (err) {
-    throw new Error('File "' + file + '": ' + err.message);
+    let e = `File "${file}": ${err.message}`;
+    this.emitError(e);
 }

@@ -64,9 +79,10 @@ });
 let intentNames = keys
-    // remove the lists: lines starting by "list." that are not intents
     .filter(intentName => !intentName.startsWith('list.'))
     .filter(intentName => !intentName.startsWith('phraselist'))
     .filter(intentName => !intentName.startsWith('builtin'))
+    // remove the lists: lines starting by "list." that are not intents
     .map(intentName => {
         if (intentName.length > 50) {
-            throw new Error(`Intent "${intentName}" should be less than 50 characters. was ${intentName.length}`);
+            let err = `Intent "${intentName}" should be less than 50 characters. was ${intentName.length}`;
+            this.emitError(err);
         }

@@ -77,12 +93,14 @@ return intentName;
 let replacements = new Map<string, string[]>();
-keys.filter(intentName => intentName.startsWith('list.'))
-    .forEach(intentName => {
+keys.filter(listKey => listKey.startsWith('list.'))
+    .forEach(listKey => {
         replacements.set(
-            intentName.slice('list.${'.length, -1),
-            this.doc[intentName]
+            listKey.slice('list.${'.length, -1),
+            this.doc[listKey]
         );
     });
+let usedReplacements = new Set<string>();
+let missedReplacements = new Set<string>();
 let entitiesMap = new Map<string, Luis.Entity>();
-let utterances = new Set<Luis.Utterance>();
+let utterancesMap = new Map<string, Luis.Utterance>();

@@ -93,3 +111,4 @@ intentNames.forEach(intent => {
 sentences
-    .map((sentence: string) => this.expandVariables(sentence, replacements))
+    .map((sentence: string) => this.searchMissedVariables(sentence, replacements, missedReplacements))
+    .map((sentence: string) => this.expandVariables(sentence, replacements, usedReplacements))
     .reduce((a: string[], b: string[]) => a.concat(b)) // flatten arrays

@@ -99,6 +118,38 @@ .forEach((sentence: string) => {
     utterance.entities.forEach(entity => this.registerEntity(entity, entitiesMap));
-    utterances.add(utterance);
+    if (utterancesMap.has(utterance.text)) {
+        let err = `Utterance "${utterance.text}" is assigned to ` +
+            `both "${utterancesMap.get(utterance.text).intent}" and "${utterance.intent}" intents`;
+        this.emitError(err);
+    }
+    utterancesMap.set(utterance.text, utterance);
     });
 });
+let utterances = Array.from(utterancesMap.values());
+// Look for intents with too few examples
+let examplesPerIntent = _.countBy(utterances, 'intent');
+let intentsWithTooFewExamples = _.pickBy(examplesPerIntent, (counter: number) => counter < MIN_EXAMPLES_PER_INTENT);
+if (!_.isEmpty(intentsWithTooFewExamples)) {
+    let err = `The following intents have less than ${MIN_EXAMPLES_PER_INTENT} examples:\n` +
+        _.keys(intentsWithTooFewExamples).map(intent => ` - ${intent}`).join('\n');
+    this.emitError(err);
+}
+// Print warnings about unused lists
+if (usedReplacements.size < replacements.size) {
+    replacements.forEach((values, key) => {
+        if (!usedReplacements.has(key)) {
+            this.emitWarning(`The list "list.$\{${key}\}" has not been used in any sentence.`);
+        }
+    });
+}
+// Print warnings about sentences with list placeholders that point to a non-existent list
+if (missedReplacements.size > 0) {
+    missedReplacements.forEach(value => {
+        this.emitWarning(`The list "list.$\{${value}\}" is being used from some sentences but it has not been declared.`);
+    });
+}
 let features = _.toPairs(this.doc.phraselist)

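The two validations above work on the flattened utterance list: a Map keyed by utterance text catches the same sentence being assigned to two intents, and a countBy/pickBy pass catches intents too sparse to train well. A standalone illustration of the counting step, using the same lodash calls as the diff (data invented):

import * as _ from 'lodash';

const utterances = [
    { text: 'turn on the light', intent: 'lights.on' },
    { text: 'switch the light on', intent: 'lights.on' },
    { text: 'lights on please', intent: 'lights.on' },
    { text: 'good morning', intent: 'greeting' } // only one example
];
const examplesPerIntent = _.countBy(utterances, 'intent');
// -> { 'lights.on': 3, greeting: 1 }
const tooFew = _.pickBy(examplesPerIntent, (counter: number) => counter < 3);
console.log(Object.keys(tooFew)); // [ 'greeting' ] -> reported via the 'error' event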
@@ -113,5 +164,6 @@ .map(value => {
 if (strword.indexOf(',') !== -1) {
-    throw new Error(`Prashe list "${name}" can not contain commas ('${strword}')`);
+    let err = `Phrase list "${name}" can not contain commas ('${strword}')`;
+    this.emitError(err);
 }
-return this.tokenize(strword).join(' ');
+return LanguageModelParser.tokenize(strword).join(' ');
 })

@@ -130,3 +182,3 @@ .join(',');
-luisModel.utterances = Array.from(utterances.values());
+luisModel.utterances = utterances;
 luisModel.entities = Array.from(entitiesMap.values());

@@ -140,5 +192,16 @@ luisModel.intents = intentNames.map(intent => <Luis.Intent>{ name: intent });
-private expandVariables(sentence: string, variables: Map<string, string[]>): string[] {
+private searchMissedVariables(sentence: string, variables: Map<string, string[]>, missedVariables: Set<string>): string {
+    let match = sentence.match(/\${(.+?)}/);
+    if (match) {
+        for (let i = 1; i < match.length; i++) {
+            if (!variables.has(match[i])) {
+                missedVariables.add(match[i]);
+            }
+        }
+    }
+    return sentence;
+}
+private expandVariables(sentence: string, variables: Map<string, string[]>, usedVariables: Set<string>): string[] {
     let expandedSentences = new Set([sentence]);
     expandedSentences.forEach(sentence => {
     variables.forEach((values, key) => {

@@ -148,2 +211,3 @@ values.forEach(value => {
 if (sentence.indexOf(search) !== -1) {
+    usedVariables.add(key);
     let newSentence = sentence.replace(search, value);

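Two details are worth noting here: searchMissedVariables only inspects the first ${...} placeholder of each sentence (its regex has no g flag), and expandVariables relies on Set.forEach visiting elements added during iteration, so sentences with several placeholders are expanded incrementally. A standalone sketch of that expansion idea, reconstructed from the fragments shown in this diff (not the exact private method):

function expandVariables(sentence: string, variables: Map<string, string[]>,
                         usedVariables: Set<string>): string[] {
    let expandedSentences = new Set([sentence]);
    expandedSentences.forEach(s => {
        variables.forEach((values, key) => {
            const search = '${' + key + '}';
            if (s.indexOf(search) !== -1) {
                usedVariables.add(key);
                // Set.forEach visits values added during iteration, so
                // partially expanded sentences get revisited and expanded again.
                values.forEach(value => expandedSentences.add(s.replace(search, value)));
                expandedSentences.delete(s); // drop the unexpanded form
            }
        });
    });
    return Array.from(expandedSentences);
}

const used = new Set<string>();
const lists = new Map([['color', ['red', 'blue']]]);
console.log(expandVariables('paint it ${color}', lists, used));
// -> [ 'paint it red', 'paint it blue' ], and used now contains 'color'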
@@ -225,28 +289,77 @@ if (newSentence !== sentence) {
-private wordCount(sentence: string): number {
-    return this.tokenize(sentence).length;
+private static wordCount(sentence: string): number {
+    return LanguageModelParser.tokenize(sentence).length;
 }
-private tokenize(sentence: string): string[] {
-    if (sentence === '') {
+/**
+ * Tokenize a sentence following the LUIS rules, returning the tokens and delimiters.
+ * TODO: Memoize this function.
+ */
+private static splitSentenceByTokens(sentence: string): Token[] {
+    if (!sentence || sentence.trim().length === 0) {
         return [];
     }
-    // separate non-word chars the same way MS does (ex. 'a,b,c' -> 'a , b , c')
-    let tokenized = String(sentence)
-        // ^\w\u00C0-\u017F means a not-word, including accented chars
-        // (see http://stackoverflow.com/a/11550799/12388)
-        .replace(/[^\w\u00C0-\u017F]/g, capture => ` ${capture} `)
-        .replace(/_/g, capture => ` ${capture} `)
-        // omit non-word exceptions not handled by microsoft ('º' and 'ª')
-        .replace(' º ', 'º')
-        .replace(' ª ', 'ª')
-        // replace multiple spaces with a single one
-        .replace(/\s\s+/g, ' ')
-        .trim()
-        .split(' ');
-    return tokenized;
+    sentence = sentence.replace(/[\s\uFEFF\xA0]+$/g, ''); // Right trim
+    // A RegExp character set with the Unicode characters (http://www.utf8-chartable.de/unicode-utf8-table.pl)
+    // that LUIS understands as part of a word. Chars not included here are
+    // treated by LUIS as separate words and therefore as independent tokens.
+    const WORD_CHARS =
+        '0-9A-Za-z' + // Numbers and English letters
+        'ªº' + // Ordinal indicators
+        '\u00B5' + // Micro sign
+        '\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02AF' + // Non-English Latin letters (accented and others)
+        '\u02B0-\u02C1' + // Modifier letters
+        '\u0370-\u0374\u0376-\u0377\u037A-\u037D\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03FF' + // Greek and Coptic alphabets
+        '\u0400-\u0481\u048A-\u0523' // Cyrillic alphabet
+        // Leaving the remaining alphabets for another brave person
+    ;
+    // A word is any number > 0 of WORD_CHARS
+    const WORD = new RegExp(`^[${WORD_CHARS}]+`);
+    // A non-word is any character not in WORD_CHARS and not a space
+    const NON_WORD = new RegExp(`^[^\s${WORD_CHARS}]`);
+    let tokens: Token[] = [];
+    // Walk through the sentence consuming chunks that match WORD or NON_WORD
+    let sentenceIndex = 0;
+    while (sentence.length) {
+        // Ignore spaces at the beginning of the remaining sentence
+        let leadingSpaces = sentence.match(/^\s*/)[0].length;
+        // Consume the spaces
+        sentenceIndex += leadingSpaces;
+        sentence = sentence.slice(leadingSpaces);
+        // Try a word
+        let tokenRegExpRes = sentence.match(WORD);
+        if (!tokenRegExpRes) {
+            // If not a word, try a non-word
+            tokenRegExpRes = sentence.match(NON_WORD);
+        }
+        if (!tokenRegExpRes) {
+            // Neither a word nor a non-word... should be impossible
+            throw new Error(`The sentence ${sentence} cannot be classified as word or non-word`);
+        }
+        let token = tokenRegExpRes[0];
+        tokens.push({
+            token: token,
+            startChar: sentenceIndex,
+            endChar: sentenceIndex + token.length - 1
+        });
+        // Consume the recognized token
+        sentenceIndex += token.length;
+        sentence = sentence.slice(token.length);
+    }
+    return tokens;
+}
+/**
+ * Tokenize a sentence following the LUIS rules and return an array of strings
+ */
+private static tokenize(sentence: string): string[] {
+    return LanguageModelParser.splitSentenceByTokens(sentence).map(token => token.token);
+}
 private buildUtterance(sentence: string, intent: string) {

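The rewrite changes the tokenizer's contract: the old version padded non-word characters with spaces and split, losing positions, while splitSentenceByTokens walks the string and keeps character offsets for later entity positioning. One subtlety: inside a template literal the sequence \s cooks to a plain s, so NON_WORD does not actually exclude whitespace; this is harmless only because leading spaces are consumed before each match. A standalone check of the word/non-word walk, with WORD_CHARS abbreviated to ASCII (the real constant also covers Latin, Greek and Cyrillic ranges):

interface Token { token: string; startChar: number; endChar: number; }

const WORD_CHARS = '0-9A-Za-z';
const WORD = new RegExp(`^[${WORD_CHARS}]+`);
const NON_WORD = new RegExp(`^[^\\s${WORD_CHARS}]`); // doubled backslash keeps \s meaning whitespace

function splitSentenceByTokens(sentence: string): Token[] {
    const tokens: Token[] = [];
    let index = 0;
    sentence = sentence.replace(/[\s\uFEFF\xA0]+$/g, ''); // right trim, as in the diff
    while (sentence.length) {
        const spaces = sentence.match(/^\s*/)[0].length;
        index += spaces;
        sentence = sentence.slice(spaces);
        const match = sentence.match(WORD) || sentence.match(NON_WORD);
        if (!match) { throw new Error(`Cannot classify: ${sentence}`); }
        const token = match[0];
        tokens.push({ token, startChar: index, endChar: index + token.length - 1 });
        index += token.length;
        sentence = sentence.slice(token.length);
    }
    return tokens;
}

console.log(splitSentenceByTokens('Hola, mundo'));
// -> [ { token: 'Hola', startChar: 0, endChar: 3 },
//      { token: ',', startChar: 4, endChar: 4 },
//      { token: 'mundo', startChar: 6, endChar: 10 } ]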
@@ -266,5 +379,5 @@ let entities: any[] = [];
 extractedEntities.forEach(entity => {
-    let startPos = this.wordCount(parts);
+    let startPos = LanguageModelParser.wordCount(parts);
     parts += entity.entityValue;
-    let endPos = this.wordCount(parts) - 1;
+    let endPos = LanguageModelParser.wordCount(parts) - 1;
     entities.push({

@@ -284,3 +397,3 @@ entity: entity.entityType,
     intent,
-    entities: entities
+    entities
 };

@@ -290,2 +403,26 @@
 }
+private warnAboutDuplicates(obj: any, prefix: string[] = []) {
+    const duplicates = (array: any[]) => _.uniq(_.filter(array, (v, i, col) => _.includes(col, v, i + 1)));
+    _.forEach(obj, (value, key) => {
+        if (value && Array.isArray(value)) {
+            let duplicatedValues = duplicates(value);
+            if (duplicatedValues.length) {
+                let breadcrumb = prefix.concat(key).join(' -> ');
+                this.emitWarning(`The key "${breadcrumb}" has the following duplicated values: ${duplicatedValues.join(', ')}`);
+            }
+        } else if (isObject(value)) {
+            this.warnAboutDuplicates(value, prefix.concat(key));
+        }
+    });
+}
+private emitWarning(msg: string) {
+    this.emit('warning', msg);
+}
+private emitError(msg: string) {
+    this.emit('error', msg);
+}
 }

@@ -292,0 +429,0 @@
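warnAboutDuplicates receives the object that yaml.safeLoad produced for a single file, so duplicates are flagged per file, before mergeDeep can silently collapse repeats across files. A small runnable illustration of its duplicate detector (document contents invented):

import * as _ from 'lodash';

// Same detector as in the diff: keep values that occur again later in the array.
const duplicates = (array: any[]) =>
    _.uniq(_.filter(array, (v, i, col) => _.includes(col, v, i + 1)));

const doc = {
    'greeting.intent': ['hello there', 'good morning', 'hello there'],
    phraselist: { greetings: ['hi hey', 'hi hey'] }
};

console.log(duplicates(doc['greeting.intent'])); // [ 'hello there' ]
console.log(duplicates(doc.phraselist.greetings)); // [ 'hi hey' ]
// The parser reports these through 'warning' events with a breadcrumb, e.g.
//   The key "phraselist -> greetings" has the following duplicated values: hi hey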

