@chattylabs/language-detection
Advanced tools
Comparing version 0.1.0 to 0.1.1
{ | ||
"name": "@chattylabs/language-detection", | ||
"version": "0.1.0", | ||
"description": "Library that helps detecting the language of a given piece of text.", | ||
"version": "0.1.1", | ||
"description": "Package to detect the language of a given text (focusing on short sms type text used on tweets, facebook, WhatsApp, etc)", | ||
"main": "src/index.js", | ||
"repository": { | ||
"type": "git", | ||
"url": "git+https://github.com/danielantelo/language-detector.git" | ||
"url": "git+https://github.com/chattylabs/language-detector" | ||
}, | ||
@@ -10,0 +10,0 @@ "author": { |
@@ -24,3 +24,3 @@ # Language Detector | ||
``` | ||
const detect = require('customisable-language-detection') | ||
const detect = require('@chattylabs/language-detection') | ||
const result = detect('some text to detect') | ||
@@ -33,3 +33,3 @@ const language = result.language | ||
``` | ||
const detect = require('customisable-language-detection') | ||
const detect = require('@chattylabs/language-detection') | ||
const customLanguageProfiles = require('../path/to/data/languageProfiles.json') | ||
@@ -48,3 +48,3 @@ | ||
const combinedProfiles = { | ||
...require('customisable-language-detection').languageProfiles, | ||
...require('@chattylabs/language-detection').languageProfiles, | ||
...customLanguageProfiles | ||
@@ -62,3 +62,3 @@ } | ||
// bin/train.js | ||
const train = require('customisable-language-detection').train | ||
const train = require('@chattylabs/language-detection').train | ||
train('./path/to/custom/samples/*.txt', './path/to/custom/export/languageProfiles.json') | ||
@@ -75,3 +75,3 @@ ``` | ||
``` | ||
const detect = require('customisable-language-detection') | ||
const detect = require('@chattylabs/language-detection') | ||
const customLanguageProfiles = require('../path/to/data/languageProfiles.json') | ||
@@ -115,3 +115,3 @@ const customReducers = require('../path/to/your/reducers') | ||
const combinedProfiles = { | ||
...require('customisable-language-detection').reducers, | ||
...require('@chattylabs/language-detection').reducers, | ||
...customReducers | ||
@@ -118,0 +118,0 @@ } |
@@ -5,3 +5,3 @@ const reducer = require('./reducer') | ||
module.exports = (text, opts) => { | ||
module.exports = (text, opts= {}) => { | ||
const { languageProfiles = defaultLanguageProfiles, reducers } = opts | ||
@@ -18,3 +18,3 @@ const allLanguages = Object.keys(languageProfiles) | ||
const languagesIntersection = () => allLanguages.filter(lang => -1 !== reducedLanguages.indexOf(lang)) | ||
const languages = reducedLanguages.length > 1 ? languagesIntersection : allLanguages; | ||
const languages = reducedLanguages.length > 1 ? languagesIntersection() : allLanguages; | ||
const inputProfile = profiler(text) | ||
@@ -24,3 +24,4 @@ const scores = {} | ||
languages.forEach(language => { | ||
const found = languageProfiles[language].findIndex(entry => entry.token === ngram.token) | ||
const foundPos = languageProfiles[language].findIndex(entry => entry.token === ngram.token) | ||
const found = foundPos >= 0 | ||
const penalty = found ? Math.abs(found - index) : 1000 | ||
@@ -33,3 +34,3 @@ language in scores ? (scores[language] -= penalty) : (scores[language] = 0 - penalty) | ||
.map(language => ({ language: language, score: scores[language] })) | ||
.sort((first, second) => first.score - second.score) | ||
.sort((first, second) => second.score - first.score) | ||
@@ -36,0 +37,0 @@ const bestMatchParts = sorted[0].language.split('_'); |
const defaultReducers = require('./utils/reducers') | ||
module.exports = (text, reducers = defaultReducers) => { | ||
let reduced; | ||
let reduced = []; | ||
reducers.forEach((current) => { | ||
@@ -9,8 +9,3 @@ if (!current.regex.test(text)) { | ||
} | ||
const currentLanguages = new Set(current.languages) | ||
if (!reduced) { | ||
reduced = currentLanguages | ||
} else { | ||
reduced = new Set([...reduced].filter(lang => currentLanguages.has(lang))); | ||
} | ||
reduced = new Set([...reduced, ...current.languages]) | ||
}) | ||
@@ -17,0 +12,0 @@ |
// @TODO | ||
module.exports = [ | ||
{ | ||
regex: /[ñ]/i, | ||
languages: ['es', 'gn', 'gl'] | ||
}, | ||
{ | ||
regex: /\bik\b/i, | ||
languages: ['nl'] | ||
}, | ||
{ | ||
regex: /\bich\b/i, | ||
languages: ['de'] | ||
}, | ||
{ | ||
regex: /ß/i, | ||
languages: ['de'] | ||
}, | ||
{ | ||
regex: /\bczy\b/i, | ||
languages: ['pl'] | ||
}, | ||
{ | ||
regex: /[Ł|ń|ś|ź]/i, | ||
languages: ['pl'] | ||
}, | ||
{ | ||
regex: /å/i, | ||
languages: ['nb', 'nn', 'fo', 'is', 'da', 'sv'] | ||
}, | ||
{ | ||
regex: /\baf\b/i, | ||
languages: ['da'] | ||
}, | ||
{ | ||
regex: /\bnei\b/i, | ||
languages: ['nb', 'nn'] | ||
}, | ||
{ | ||
regex: /\och\b/i, | ||
languages: ['sv'] | ||
}, | ||
{ | ||
regex: /[ı|ğ|ș]/i, | ||
languages: ['tr'] | ||
}, | ||
{ | ||
regex: /[ă|ș|ț]/i, | ||
languages: ['ro'] | ||
}, | ||
{ | ||
regex: /[ă]/i, | ||
languages: ['vi'] | ||
}, | ||
{ | ||
regex: /[á|é|í|ó|ú]/i, | ||
languages: ['fr', 'es', 'it', 'cn', 'nl', 'fo', 'is', 'pt', 'vi', 'cy', 'el'] | ||
}, | ||
// { | ||
// regex: /[ñ]+/i, | ||
// languages: ['es', 'gn', 'gl'] | ||
// }, | ||
// { | ||
// regex: /[ü]+/i, | ||
@@ -16,6 +68,2 @@ // languages: ['es', 'tr', 'fr', 'hu', 'et', 'de', 'sv'] | ||
// { | ||
// regex: /[á|é|í|ó|ú]+/i, | ||
// languages: ['fr', 'es', 'it', 'cn', 'nl', 'fo', 'is', 'pt', 'vi', 'cy', 'el'] | ||
// }, | ||
// { | ||
// regex: /[â|ê|î]+/i, | ||
@@ -22,0 +70,0 @@ // languages: ['fr', 'it', 'pt', 'ro', 'ru', 'hr', 'tr', 'vi'] |
const detect = require('../src') | ||
const languageProfilesMock = require('./__mocks__/languageProfiles.json') | ||
test('detects english', () => { | ||
test('allows to pass custom profiled data', () => { | ||
expect(detect('hello worldy world', { | ||
@@ -14,7 +14,11 @@ languageProfiles: languageProfilesMock, | ||
test('detects english', () => { | ||
expect(detect("what's up dude")).toEqual({ | ||
language: 'en', | ||
country: '' | ||
}) | ||
}) | ||
test('detects spanish', () => { | ||
expect(detect('q tal tío', { | ||
languageProfiles: languageProfilesMock, | ||
reducers: [] | ||
})).toEqual({ | ||
expect(detect("que pasa tío")).toEqual({ | ||
language: 'es', | ||
@@ -21,0 +25,0 @@ country: '' |
test('it exports detector as default', () => { | ||
const detect = require('customisable-language-detection') | ||
const detect = require('../src') | ||
expect(detect).toBeDefined | ||
@@ -8,3 +8,3 @@ expect(typeof detect).toBe('function') | ||
test('it exports the base language profiles', () => { | ||
const profiles = require('customisable-language-detection').languageProfiles | ||
const profiles = require('../src').languageProfiles | ||
expect(profiles).toBeDefined | ||
@@ -15,3 +15,3 @@ expect(typeof profiles).toBe('object') | ||
test('it exports the base reducers', () => { | ||
const reducers = require('customisable-language-detection').reducers | ||
const reducers = require('../src').reducers | ||
expect(reducers).toBeDefined | ||
@@ -22,3 +22,3 @@ expect(Array.isArray(reducers)).toBe(true) | ||
test('it exports the reducer', () => { | ||
const reducer = require('customisable-language-detection').reducer | ||
const reducer = require('../src').reducer | ||
expect(reducer).toBeDefined | ||
@@ -29,3 +29,3 @@ expect(typeof reducer).toBe('function') | ||
test('it exports the profiler', () => { | ||
const profiler = require('customisable-language-detection').profiler | ||
const profiler = require('../src').profiler | ||
expect(profiler).toBeDefined | ||
@@ -36,3 +36,3 @@ expect(typeof profiler).toBe('function') | ||
test('it exports the trainer', () => { | ||
const trainer = require('customisable-language-detection').trainer | ||
const trainer = require('../src').trainer | ||
expect(trainer).toBeDefined | ||
@@ -39,0 +39,0 @@ expect(typeof trainer).toBe('function') |
const reduce = require('../src/reducer') | ||
test('reduces texts with vowel accents', () => { | ||
expect(reduce('some accént')).toEqual(['fr', 'es', 'it', 'cn', 'nl', 'fo', 'is', 'pt', 'vi', 'cy', 'el']) | ||
}) | ||
const assertTextReducesToLanguages = (text, expectedLanguages) => { | ||
return expect(reduce(text).sort()).toEqual(expectedLanguages.sort()) | ||
} | ||
test('reduces texts with ñ ignoring case', () => { | ||
expect(reduce('bua niÑo')).toEqual(['es', 'gn', 'gl']) | ||
assertTextReducesToLanguages('bua niÑo', ['es', 'gn', 'gl']) | ||
}) | ||
test('returns the smallest intersection set of matches', () => { | ||
expect(reduce('buá niño', [ | ||
{ | ||
regex: /[ñ]+/i, | ||
languages: ['es', 'gn', 'gl'] | ||
}, | ||
{ | ||
regex: /[á]+/i, | ||
languages: ['es'] | ||
} | ||
])).toEqual(['es']) | ||
test('Dutch if there is ik', () => { | ||
assertTextReducesToLanguages('Ik kan er nooit tegen als mensen me negeren.', ['nl']) | ||
assertTextReducesToLanguages('kan ik er nooit tegen als mensen me negeren.', ['nl']) | ||
// note that it should not catch ik within a word | ||
assertTextReducesToLanguages('fik er nooit tegen als mensen me negeren.', []) | ||
}) | ||
test('German if there is ich or a letter ß', () => { | ||
assertTextReducesToLanguages('Aha ich seh angeblich', ['de']) | ||
assertTextReducesToLanguages('Ich seh angeblich', ['de']) | ||
assertTextReducesToLanguages('bIch bich', []) | ||
}) | ||
test('German if there is the letter ß', () => { | ||
assertTextReducesToLanguages('seh angeblich süß aus', ['de']) | ||
}) | ||
test('Polish if there is czy', () => { | ||
assertTextReducesToLanguages('Czy mogłbym zasnąć w przedmieściach Twoich myśli?', ['pl']) | ||
}) | ||
test('Polish if there is letters Ł, ń, ś or ź', () => { | ||
assertTextReducesToLanguages('mogłbym Ła', ['pl']) | ||
assertTextReducesToLanguages('ńa', ['pl']) | ||
assertTextReducesToLanguages('śa', ['pl']) | ||
assertTextReducesToLanguages('źa', ['pl']) | ||
}) | ||
test('Scandinavian if there is a letter å', () => { | ||
assertTextReducesToLanguages('Så skal jeg bare finde ud', ['nb', 'nn', 'fo', 'is', 'da', 'sv']) | ||
}) | ||
test('Danish if there is af', () => { | ||
assertTextReducesToLanguages('skal jeg bare finde ud af', ['da']) | ||
}) | ||
test('Norwegian if there is nei', () => { | ||
assertTextReducesToLanguages('nei vi som har finale ', ['nb', 'nn']) | ||
}) | ||
test('Swedish if there is och', () => { | ||
assertTextReducesToLanguages('fb och du tog', ['sv']) | ||
}) | ||
test('Turkish if there is a letter ı (i without point) or ğ or ş', () => { | ||
assertTextReducesToLanguages('En büyük hatayı yaptım', ['tr']) | ||
assertTextReducesToLanguages('Çok doğru', ['tr']) | ||
}) | ||
test('Romanian if there is a letter ă or ș or ț (although ă is also used in Vietnamese as ş is in Turkish)', () => { | ||
assertTextReducesToLanguages('Încântat de șa cunoștință', ['ro', 'tr', 'vi']) | ||
}) |
Sorry, the diff of this file is too big to display
10343206
5558