api-ai-speech-context
Advanced tools
Comparing version
const fs = require('fs'); | ||
const agentDownloader = require('api-ai-agent-downloader'); | ||
const { MOST_COMMON_10K_WORDS, generateSpeechContext } = require('./index'); | ||
const { generateSpeechContext } = require('./index'); | ||
@@ -23,7 +23,8 @@ | ||
const developerToken = 'abc123'; | ||
const blacklist = MOST_COMMON_10K_WORDS.concat(['more', 'bad', 'words']); | ||
const blacklist = ['bad', 'words', 'and phrases']; | ||
const removeCommonWords = true; // true by default | ||
getAgentSummaryPromise(agentName, developerToken) | ||
.then(summary => { | ||
const context = generateSpeechContext(summary, blacklist); | ||
const context = generateSpeechContext(summary, blacklist, removeCommonWords); | ||
console.log(`=> Generated speech context for ${agentName}.`); | ||
@@ -30,0 +31,0 @@ fs.writeFileSync('./speechContext.json', JSON.stringify(context)); |
154
index.js
const fs = require('fs'); | ||
const generateSpeechContext = require('./generateSpeechContext'); | ||
const commonWordsFile = fs.readFileSync('./data/mostCommonWordsFromGoogle.txt', 'utf8'); | ||
const MOST_COMMON_10K_WORDS = commonWordsFile.toString().split('\n'); | ||
module.exports = { | ||
MOST_COMMON_10K_WORDS, | ||
generateSpeechContext, | ||
// Google Cloud Speech API context limits. | ||
// https://cloud.google.com/speech/limits#content | ||
const CHAR_COUNT_LIMIT = 10000; | ||
const PHRASES_LIMIT = 500; | ||
const CHAR_COUNT_PER_PHRASE_LIMIT = 100; | ||
const ENTITY_REGEXP = /(^| )@([^ ]+)/g; | ||
const replaceEntityLabels = (template, char) => { | ||
if (!template) return ''; | ||
return template.replace(ENTITY_REGEXP, char); | ||
} | ||
const flatten = (nestedArray) => { | ||
return [].concat.apply([], nestedArray); | ||
} | ||
const deduplicate = (array) => { | ||
var seen = {}; | ||
return array.filter(function(item) { | ||
return seen.hasOwnProperty(item) ? false : (seen[item] = true); | ||
}); | ||
} | ||
const removeNonWords = (array) => { | ||
const processed = array.map(function(item) { | ||
if (!item) return; | ||
return item.replace(/[.,\/?#!$%\^&\*;:{}=\-_`~()]/g,'').replace(/\s{2,}/g,' ').trim(); | ||
}); | ||
return processed.filter(function(item) { | ||
return Boolean(item); | ||
}); | ||
} | ||
const removeTooLong = (array, maxLength) => { | ||
return array.filter(item => item.length <= maxLength); | ||
} | ||
const getEntityPhrases = (entities) => { | ||
const nestedPhrases = entities.map((entity) => { | ||
if (!entity.entries) return; | ||
const synonymsInArrays = entity.entries.map((entry) => { | ||
let synonyms = entry.synonyms; | ||
if ((typeof synonyms) === 'string') { | ||
synonyms = [synonyms]; | ||
} | ||
return synonyms; | ||
}); | ||
return flatten(synonymsInArrays); | ||
}); | ||
const phrases = flatten(nestedPhrases); | ||
const phrasesInParts = phrases.map((phrase) => { | ||
return replaceEntityLabels(phrase, '~').split('~'); | ||
}); | ||
return flatten(phrasesInParts); | ||
} | ||
const getIntentPhrases = (intents) => { | ||
const nestedPhrases = intents.map((intent) => { | ||
if (!intent.templates) return; | ||
const templatesInParts = intent.templates.map((template) => { | ||
return replaceEntityLabels(template, '~').split('~'); | ||
}); | ||
return flatten(templatesInParts); | ||
}); | ||
return flatten(nestedPhrases); | ||
} | ||
const ensureStringifiedCharCountWithinLimit = (array, limit) => { | ||
// Binary search to find the the highest index with char count under the limit. | ||
let lower = 0; | ||
let upper = array.length; | ||
let bestIndex = 0; | ||
let bestCharCount = 0; | ||
while (lower <= upper) { | ||
const index = Math.floor((upper + lower) / 2); | ||
const trimmed = array.slice(0, index); | ||
const charCount = JSON.stringify(trimmed).length; | ||
if (charCount < limit) { | ||
lower = index + 1; | ||
} else { | ||
upper = index - 1; | ||
} | ||
if (charCount <= limit && charCount > bestCharCount) { | ||
bestCharCount = charCount; | ||
bestIndex = index; | ||
} | ||
} | ||
return array.slice(0, bestIndex); | ||
} | ||
const getAllPhrases = (agentSummary) => { | ||
const entityPhrases = getEntityPhrases(agentSummary.entities); | ||
const intentPhrases = getIntentPhrases(agentSummary.intents); | ||
let allPhrases = entityPhrases.concat(intentPhrases); | ||
allPhrases = allPhrases.filter(phrase => typeof phrase === 'string'); | ||
allPhrases = allPhrases.map(phrase => phrase.toLowerCase()); | ||
allPhrases = removeNonWords(allPhrases); | ||
allPhrases = removeTooLong(allPhrases, 100); | ||
allPhrases = deduplicate(allPhrases); | ||
return allPhrases; | ||
}; | ||
module.exports.generateSpeechContext = (agentSummary, blacklist, removeCommonWords = true) => { | ||
const allPhrases = getAllPhrases(agentSummary); | ||
const words = []; | ||
const acronyms = []; // e.g. 'f o o b a r' | ||
const phrases = []; | ||
allPhrases.forEach((phrase) => { | ||
if (phrase.indexOf(' ') === -1) { | ||
words.push(phrase); | ||
} else if (phrase.match(/^(\w )+\w$/)) { | ||
acronyms.push(phrase); | ||
} else { | ||
phrases.push(phrase); | ||
} | ||
}); | ||
const collapsedAcronyms = acronyms.map(acronym => acronym.replace(/ /g, '')); | ||
const acronymsSet = new Set(collapsedAcronyms); | ||
const commonSet = new Set(MOST_COMMON_10K_WORDS); | ||
const blacklistSet = new Set(blacklist); | ||
const isNotBlacklisted = word => !blacklistSet.has(word); | ||
const isNotAcronym = word => !acronymsSet.has(word); | ||
let chosenWords = words.filter(isNotBlacklisted).filter(isNotAcronym); | ||
if (removeCommonWords) { | ||
chosenWords = chosenWords.filter(word => !commonSet.has(word)); | ||
} | ||
const chosenPhrases = phrases.filter((phrase) => { | ||
let keep = !blacklistSet.has(phrase) && phrase.length < CHAR_COUNT_PER_PHRASE_LIMIT; | ||
if (removeCommonWords) { | ||
const wordsInPhrase = phrase.split(' '); | ||
const containsRareWord = !wordsInPhrase.every(word => commonSet.has(word)); | ||
keep = keep && containsRareWord; | ||
} | ||
return keep; | ||
}); | ||
const context = chosenWords.concat(chosenPhrases).concat(acronyms).splice(0, 500); | ||
return ensureStringifiedCharCountWithinLimit(context, CHAR_COUNT_LIMIT); | ||
}; |
{ | ||
"name": "api-ai-speech-context ", | ||
"version": "0.0.2", | ||
"version": "0.0.3", | ||
"description": "Generates a Google Cloud Speech API SpeechContext for an API.AI agent.", | ||
@@ -5,0 +5,0 @@ "main": "index.js", |
Major refactor
Supply chain riskPackage has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
93762
12.04%158
1.28%2
100%