languagedetect
Advanced tools
Comparing version 0.0.1 to 1.0.0
@@ -32,3 +32,3 @@ /** | ||
var php = require('./phpjs') | ||
, db_lang = require('../data/lang') | ||
, db_lang = require('../data/lang.json') | ||
, Parser = require('./Parser'); | ||
@@ -35,0 +35,0 @@ |
@@ -1,2 +0,2 @@ | ||
var db_unicode_blocks = require('../data/unicode_blocks'); | ||
var db_unicode_blocks = require('../data/unicode_blocks.json'); | ||
var php = require('../lib/phpjs'); | ||
@@ -37,6 +37,6 @@ | ||
this._unicode_blocks = {}; | ||
/** | ||
* Whether the parser should compile the unicode ranges | ||
* | ||
* | ||
* @access private | ||
@@ -87,3 +87,3 @@ * @var bool | ||
*/ | ||
this._string = string; | ||
this._string = string ? string.replace(/[`~!@#$%^&*()_|+\-=?;:'",.<>\{\}\[\]\\\/]/g, '') : ''; | ||
}; | ||
@@ -141,3 +141,3 @@ | ||
// for a reference, see | ||
// for a reference, see | ||
// http://www.unicode.org/Public/UNIDATA/Blocks.txt | ||
@@ -179,6 +179,6 @@ | ||
// failed to find the block | ||
// failed to find the block | ||
return -1; | ||
// todo: differentiate when it's out of range or when it falls | ||
// todo: differentiate when it's out of range or when it falls | ||
// into an unassigned range? | ||
@@ -194,4 +194,4 @@ } | ||
* Executes the parsing operation | ||
* | ||
* Be sure to call the set*() functions to set options and the | ||
* | ||
* Be sure to call the set*() functions to set options and the | ||
* prepare*() functions first to tell it what kind of data to compute | ||
@@ -240,3 +240,3 @@ * | ||
if (!this._trigram_pad_start) { | ||
tmp = this._next_char(this._string, byte_counter, true); | ||
tmp = this._next_char(this._string, byte_counter); | ||
byte_counter = tmp[0]; | ||
@@ -246,3 +246,3 @@ a = tmp[1]; | ||
if (a != ' ') { | ||
tmp = this._next_char(this._string, byte_counter, true); | ||
tmp = this._next_char(this._string, byte_counter); | ||
byte_counter = tmp[0]; | ||
@@ -263,3 +263,3 @@ b = tmp[1]; | ||
tmp = this._next_char(this._string, byte_counter, true); | ||
tmp = this._next_char(this._string, byte_counter); | ||
byte_counter = tmp[0]; | ||
@@ -364,3 +364,3 @@ _char = tmp[1]; | ||
* Will get the next character starting from $counter, which will then be | ||
* incremented. If a multi-byte char the bytes will be concatenated and | ||
* incremented. If a multi-byte char the bytes will be concatenated and | ||
* $counter will be incremeted by the number of bytes in the char. | ||
@@ -374,90 +374,5 @@ * | ||
*/ | ||
, _next_char: function(str, counter, _special_convert){ | ||
var special_convert = _special_convert || false; | ||
//console.log('counter=', counter); | ||
var _char = str.charAt(counter++); | ||
//console.log('char=', _char) | ||
//console.log('counter=', counter); | ||
var ord = php.ord(_char); | ||
//console.log('ord=', ord); | ||
// for a description of the utf8 system see | ||
// http://www.phpclasses.org/browse/file/5131.html | ||
// normal ascii one byte char | ||
if (ord <= 127) { | ||
// special conversions needed for this package | ||
// (that only apply to regular ascii characters) | ||
// lower case, and convert all non-alphanumeric characters | ||
// other than "'" to space | ||
if (special_convert && _char != ' ' && _char != "'") { | ||
if (ord >= 65 && ord <= 90) { // A-Z | ||
_char = php.chr(ord + 32); // lower case | ||
} else if (ord < 97 || ord > 122) { // NOT a-z | ||
_char = ' '; // convert to space | ||
} | ||
} | ||
return [counter, _char]; | ||
// multi-byte chars | ||
} else if (ord >> 5 == 6) { // two-byte char | ||
nextchar = str[counter++]; // get next byte | ||
// lower-casing of non-ascii characters is still incomplete | ||
if (special_convert) { | ||
// lower case latin accented characters | ||
if (ord == 195) { | ||
nextord = php.ord(nextchar); | ||
nextord_adj = nextord + 64; | ||
// for a reference, see | ||
// http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html | ||
// À - Þ but not × | ||
if ( nextord_adj >= 192 | ||
&& nextord_adj <= 222 | ||
&& nextord_adj != 215) { | ||
nextchar = php.chr(nextord + 32); | ||
} | ||
// lower case cyrillic alphabet | ||
} else if (ord == 208) { | ||
nextord = php.ord(nextchar); | ||
// if A - Pe | ||
if (nextord >= 144 && nextord <= 159) { | ||
// lower case | ||
nextchar = php.chr(nextord + 32); | ||
// if Er - Ya | ||
} else if (nextord >= 160 && nextord <= 175) { | ||
// lower case | ||
_char = php.chr(209); // == $ord++ | ||
nextchar = php.chr(nextord - 32); | ||
} | ||
} | ||
} | ||
// tag on next byte | ||
return [counter, _char + nextchar]; | ||
} else if (ord >> 4 == 14) { // three-byte char | ||
// tag on next 2 bytes | ||
var ret = _char + str[counter++] + str[counter++]; | ||
return [counter, ret]; | ||
} else if (ord >> 3 == 30) { // four-byte _char | ||
// tag on next 3 bytes | ||
var ret = _char + str[counter++] + str[counter++] + str[counter++]; | ||
return [counter, ret]; | ||
} else { | ||
// error? | ||
// FIXME | ||
return [counter, ' ']; | ||
} | ||
, _next_char: function(str, counter){ | ||
var _char = str.charAt(counter++); | ||
return [counter, _char.toLowerCase()]; | ||
} | ||
@@ -494,3 +409,3 @@ | ||
// 3 byte unicode | ||
// 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx | ||
// 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx | ||
var z = (php.ord(_char[0]) &0x0000000F) << 12; | ||
@@ -513,3 +428,3 @@ var x1 = (php.ord(_char[1]) & 0x0000003F) << 6; | ||
default: | ||
default: | ||
// error: malformatted char? | ||
@@ -522,3 +437,3 @@ return -1; | ||
* Sorts an array by value breaking ties alphabetically | ||
* | ||
* | ||
* @access private | ||
@@ -563,3 +478,3 @@ * @param array &$arr the array to sort | ||
* @access protected | ||
* @param array $arr array of trgram | ||
* @param array $arr array of trgram | ||
* @return array ranks of trigrams | ||
@@ -594,3 +509,3 @@ */ | ||
* | ||
* Callback function for usort(). | ||
* Callback function for usort(). | ||
* | ||
@@ -597,0 +512,0 @@ * @access private |
@@ -5,5 +5,4 @@ { | ||
"keywords": ["n-gram", "language", "language detection"], | ||
"version": "0.0.1", | ||
"version": "1.0.0", | ||
"homepage": "http://blog.fgribreau.com/2011/07/week-end-project-nodejs-language.html", | ||
"devDependencies": { "nodeunit": ">= 0.5.1" }, | ||
"author" : "Francois-Guillaume Ribreau <npm@fgribreau.com> (http://fgribreau.com)", | ||
@@ -18,3 +17,12 @@ "main" : "index", | ||
], | ||
"engines": { "node": "*" } | ||
} | ||
"scripts": { | ||
"test": "node_modules/nodeunit/bin/nodeunit test/*.test.js" | ||
}, | ||
"contributors": [ | ||
"chaoser <ruslan.zavackiy@gmail.com> (https://github.com/Chaoser)" | ||
], | ||
"devDependencies":{ | ||
"nodeunit":">= 0.5.1" | ||
}, | ||
"engines": { "node": ">= 0.4.8" } | ||
} |
@@ -0,1 +1,2 @@ | ||
# Node Language Detect [![Build Status](https://secure.travis-ci.org/FGRibreau/node-language-detect.png)](http://travis-ci.org/FGRibreau/node-language-detect) # | ||
`LanguageDetect` is a port of the [PEAR::Text_LanguageDetect](http://pear.php.net/package/Text_LanguageDetect) for [node.js](http://nodejs.org). | ||
@@ -22,3 +23,2 @@ | ||
/* | ||
Will print: | ||
[ [ 'english', 0.5969230769230769 ], | ||
@@ -38,3 +38,3 @@ [ 'hungarian', 0.407948717948718 ], | ||
// Only get first 2 results | ||
// Only get the first 2 results | ||
console.log(lngDetector.detect('This is a test.', 2)); | ||
@@ -41,0 +41,0 @@ |
@@ -22,23 +22,23 @@ (function() { | ||
t.equal(a._distance(a._lang_db['arabic'], trigram_freqs), 42900); | ||
t.equal(a._distance(a._lang_db['azeri'], trigram_freqs), 41727); | ||
t.equal(a._distance(a._lang_db['azeri'], trigram_freqs), 41739); | ||
t.equal(a._distance(a._lang_db['bengali'], trigram_freqs), 42900); | ||
t.equal(a._distance(a._lang_db['bulgarian'], trigram_freqs), 42900); | ||
t.equal(a._distance(a._lang_db['cebuano'], trigram_freqs), 40041); | ||
t.equal(a._distance(a._lang_db['croatian'], trigram_freqs), 37103); | ||
t.equal(a._distance(a._lang_db['czech'], trigram_freqs), 39100); | ||
t.equal(a._distance(a._lang_db['danish'], trigram_freqs), 35334); | ||
t.equal(a._distance(a._lang_db['dutch'], trigram_freqs), 37691); | ||
t.equal(a._distance(a._lang_db['english'], trigram_freqs), 27435); | ||
t.equal(a._distance(a._lang_db['estonian'], trigram_freqs), 37512); | ||
t.equal(a._distance(a._lang_db['cebuano'], trigram_freqs), 40051); | ||
t.equal(a._distance(a._lang_db['croatian'], trigram_freqs), 37390); | ||
t.equal(a._distance(a._lang_db['czech'], trigram_freqs), 39284); | ||
t.equal(a._distance(a._lang_db['danish'], trigram_freqs), 35149); | ||
t.equal(a._distance(a._lang_db['dutch'], trigram_freqs), 37838); | ||
t.equal(a._distance(a._lang_db['english'], trigram_freqs), 27607); | ||
t.equal(a._distance(a._lang_db['estonian'], trigram_freqs), 37536); | ||
t.equal(a._distance(a._lang_db['farsi'], trigram_freqs), 42900); | ||
t.equal(a._distance(a._lang_db['finnish'], trigram_freqs), 38619); | ||
t.equal(a._distance(a._lang_db['french'], trigram_freqs), 34141); | ||
t.equal(a._distance(a._lang_db['german'], trigram_freqs), 37005); | ||
t.equal(a._distance(a._lang_db['hausa'], trigram_freqs), 40622); | ||
t.equal(a._distance(a._lang_db['hawaiian'], trigram_freqs), 40878); | ||
t.equal(a._distance(a._lang_db['finnish'], trigram_freqs), 38637); | ||
t.equal(a._distance(a._lang_db['french'], trigram_freqs), 34185); | ||
t.equal(a._distance(a._lang_db['german'], trigram_freqs), 37030); | ||
t.equal(a._distance(a._lang_db['hausa'], trigram_freqs), 40827); | ||
t.equal(a._distance(a._lang_db['hawaiian'], trigram_freqs), 40890); | ||
t.equal(a._distance(a._lang_db['hindi'], trigram_freqs), 42900); | ||
t.equal(a._distance(a._lang_db['hungarian'], trigram_freqs), 37880); | ||
t.equal(a._distance(a._lang_db['icelandic'], trigram_freqs), 39340); | ||
t.equal(a._distance(a._lang_db['indonesian'], trigram_freqs), 40286); | ||
t.equal(a._distance(a._lang_db['italian'], trigram_freqs), 34882); | ||
t.equal(a._distance(a._lang_db['hungarian'], trigram_freqs), 37891); | ||
t.equal(a._distance(a._lang_db['icelandic'], trigram_freqs), 39345); | ||
t.equal(a._distance(a._lang_db['indonesian'], trigram_freqs), 40298); | ||
t.equal(a._distance(a._lang_db['italian'], trigram_freqs), 34749); | ||
t.equal(a._distance(a._lang_db['kazakh'], trigram_freqs), 42900); | ||
@@ -75,5 +75,5 @@ return t.done(); | ||
r = l.detect(tweet); | ||
t.deepEqual(r[0], ['english', 0.3604895104895105]); | ||
t.deepEqual(r[0], ['english', 0.35648018648018653]); | ||
return t.done(); | ||
}; | ||
}).call(this); |
@@ -693,146 +693,145 @@ (function() { | ||
l.analyze(); | ||
t.deepEqual(l.getTrigramRanks(), { | ||
"ion": 0, | ||
"on ": 1, | ||
" so": 2, | ||
"ess": 3, | ||
"hou": 4, | ||
"n s": 5, | ||
"oul": 6, | ||
"re ": 7, | ||
"tio": 8, | ||
"ust": 9, | ||
" a ": 10, | ||
" al": 11, | ||
" ar": 12, | ||
" b ": 13, | ||
" bo": 14, | ||
" do": 15, | ||
" ex": 16, | ||
" fo": 17, | ||
" fr": 18, | ||
" go": 19, | ||
" ho": 20, | ||
" if": 21, | ||
" in": 22, | ||
" ju": 23, | ||
" li": 24, | ||
" me": 25, | ||
" pv": 26, | ||
" se": 27, | ||
" sh": 28, | ||
" st": 29, | ||
" sw": 30, | ||
" th": 31, | ||
" to": 32, | ||
" u ": 33, | ||
" wh": 34, | ||
"a s": 35, | ||
"alw": 36, | ||
"are": 37, | ||
"at ": 38, | ||
"ati": 39, | ||
"atu": 40, | ||
"ays": 41, | ||
"b l": 42, | ||
"bot": 43, | ||
"d a": 44, | ||
"d s": 45, | ||
"don": 46, | ||
"e g": 47, | ||
"e i": 48, | ||
"e s": 49, | ||
"elf": 50, | ||
"ent": 51, | ||
"ers": 52, | ||
"exp": 53, | ||
"f e": 54, | ||
"f t": 55, | ||
"fol": 56, | ||
"fro": 57, | ||
"goo": 58, | ||
"hat": 59, | ||
"her": 60, | ||
"hom": 61, | ||
"hos": 62, | ||
"if ": 63, | ||
"imi": 64, | ||
"int": 65, | ||
"itl": 66, | ||
"jus": 67, | ||
"l w": 68, | ||
"ld ": 69, | ||
"les": 70, | ||
"lf ": 71, | ||
"lim": 72, | ||
"llo": 73, | ||
"low": 74, | ||
"lwa": 75, | ||
"m s": 76, | ||
"me ": 77, | ||
"mit": 78, | ||
"mor": 79, | ||
"n t": 80, | ||
"nat": 81, | ||
"ns ": 82, | ||
"nt ": 83, | ||
"nte": 84, | ||
"nti": 85, | ||
"od ": 86, | ||
"oll": 87, | ||
"om ": 88, | ||
"omo": 89, | ||
"ons": 90, | ||
"ont": 91, | ||
"ood": 92, | ||
"oph": 93, | ||
"ore": 94, | ||
"ose": 95, | ||
"oth": 96, | ||
"ous": 97, | ||
"ow ": 98, | ||
"pho": 99, | ||
"pre": 100, | ||
"pvn": 101, | ||
"res": 102, | ||
"rom": 103, | ||
"rs ": 104, | ||
"s a": 105, | ||
"s b": 106, | ||
"s i": 107, | ||
"s j": 108, | ||
"s u": 109, | ||
"se ": 110, | ||
"sel": 111, | ||
"sho": 112, | ||
"sio": 113, | ||
"sop": 114, | ||
"sou": 115, | ||
"ss ": 116, | ||
"ssi": 117, | ||
"st ": 118, | ||
"sta": 119, | ||
"sto": 120, | ||
"sw ": 121, | ||
"t a": 122, | ||
"t b": 123, | ||
"t f": 124, | ||
"tat": 125, | ||
"ten": 126, | ||
"tha": 127, | ||
"the": 128, | ||
"tle": 129, | ||
"to ": 130, | ||
"ton": 131, | ||
"tus": 132, | ||
"ul ": 133, | ||
"uld": 134, | ||
"us ": 135, | ||
"vna": 136, | ||
"w h": 137, | ||
"w m": 138, | ||
"way": 139, | ||
"who": 140, | ||
"xpr": 141, | ||
"ys ": 142 | ||
t.deepEqual(l.getTrigramRanks(), { ion: 0, | ||
'on ': 1, | ||
' so': 2, | ||
ess: 3, | ||
hou: 4, | ||
'n s': 5, | ||
oul: 6, | ||
're ': 7, | ||
tio: 8, | ||
ust: 9, | ||
' a ': 10, | ||
' al': 11, | ||
' ar': 12, | ||
' b ': 13, | ||
' bo': 14, | ||
' fo': 15, | ||
' fr': 16, | ||
' go': 17, | ||
' ho': 18, | ||
' if': 19, | ||
' in': 20, | ||
' ju': 21, | ||
' li': 22, | ||
' me': 23, | ||
' pv': 24, | ||
' se': 25, | ||
' sh': 26, | ||
' st': 27, | ||
' sw': 28, | ||
' th': 29, | ||
' to': 30, | ||
' ud': 31, | ||
' wh': 32, | ||
'a s': 33, | ||
alw: 34, | ||
are: 35, | ||
'at ': 36, | ||
ati: 37, | ||
atu: 38, | ||
ays: 39, | ||
'b l': 40, | ||
bot: 41, | ||
'd a': 42, | ||
'd s': 43, | ||
don: 44, | ||
'e g': 45, | ||
'e i': 46, | ||
'e s': 47, | ||
elf: 48, | ||
ent: 49, | ||
ers: 50, | ||
exp: 51, | ||
'f t': 52, | ||
fex: 53, | ||
fol: 54, | ||
fro: 55, | ||
goo: 56, | ||
hat: 57, | ||
her: 58, | ||
hom: 59, | ||
hos: 60, | ||
'if ': 61, | ||
imi: 62, | ||
int: 63, | ||
itl: 64, | ||
jus: 65, | ||
'l w': 66, | ||
'ld ': 67, | ||
les: 68, | ||
lfe: 69, | ||
lim: 70, | ||
llo: 71, | ||
low: 72, | ||
lwa: 73, | ||
'm s': 74, | ||
'me ': 75, | ||
mit: 76, | ||
mor: 77, | ||
'n t': 78, | ||
nat: 79, | ||
'ns ': 80, | ||
'nt ': 81, | ||
nte: 82, | ||
nti: 83, | ||
'o p': 84, | ||
'od ': 85, | ||
oll: 86, | ||
'om ': 87, | ||
omo: 88, | ||
ons: 89, | ||
ont: 90, | ||
ood: 91, | ||
oph: 92, | ||
ore: 93, | ||
ose: 94, | ||
oth: 95, | ||
ous: 96, | ||
'ow ': 97, | ||
pho: 98, | ||
pre: 99, | ||
pvn: 100, | ||
res: 101, | ||
rom: 102, | ||
'rs ': 103, | ||
's a': 104, | ||
's b': 105, | ||
's i': 106, | ||
's j': 107, | ||
's u': 108, | ||
'se ': 109, | ||
sel: 110, | ||
sho: 111, | ||
sio: 112, | ||
sop: 113, | ||
sou: 114, | ||
'ss ': 115, | ||
ssi: 116, | ||
'st ': 117, | ||
sta: 118, | ||
sto: 119, | ||
'sw ': 120, | ||
't a': 121, | ||
't b': 122, | ||
't f': 123, | ||
tat: 124, | ||
ten: 125, | ||
tha: 126, | ||
the: 127, | ||
tle: 128, | ||
'to ': 129, | ||
ton: 130, | ||
tus: 131, | ||
udo: 132, | ||
'ul ': 133, | ||
uld: 134, | ||
'us ': 135, | ||
vna: 136, | ||
'w h': 137, | ||
'w m': 138, | ||
way: 139, | ||
who: 140, | ||
xpr: 141, | ||
'ys ': 142 | ||
}); | ||
@@ -839,0 +838,0 @@ return t.done(); |
New author
Supply chain risk: A new npm collaborator published a version of the package for the first time. New collaborators are usually benign additions to a project, but they do indicate a change to the security surface area of a package.
Found 1 instance in 1 package
Non-existent author
Supply chain risk: The package was published by an npm account that no longer exists.
Found 1 instance in 1 package
Filesystem access
Supply chain risk: The package accesses the file system and could potentially read sensitive data.
Found 1 instance in 1 package
No tests
Quality: The package does not have any tests. This is a strong signal of a poorly maintained or low-quality package.
Found 1 instance in 1 package
No v1
Quality: The package is not semver >=1. This means it is not stable and does not support ^ ranges.
Found 1 instance in 1 package
0
0
0
446337
2212
1