Socket
Socket
Sign in | Demo | Install

languagedetect

Package Overview
Dependencies
0
Maintainers
1
Versions
7
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 0.0.1 to 1.0.0

.npmignore

2

lib/LanguageDetect.js

@@ -32,3 +32,3 @@ /**

var php = require('./phpjs')
, db_lang = require('../data/lang')
, db_lang = require('../data/lang.json')
, Parser = require('./Parser');

@@ -35,0 +35,0 @@

@@ -1,2 +0,2 @@

var db_unicode_blocks = require('../data/unicode_blocks');
var db_unicode_blocks = require('../data/unicode_blocks.json');
var php = require('../lib/phpjs');

@@ -37,6 +37,6 @@

this._unicode_blocks = {};
/**
* Whether the parser should compile the unicode ranges
*
*
* @access private

@@ -87,3 +87,3 @@ * @var bool

*/
this._string = string;
this._string = string ? string.replace(/[`~!@#$%^&*()_|+\-=?;:'",.<>\{\}\[\]\\\/]/g, '') : '';
};

@@ -141,3 +141,3 @@

// for a reference, see
// for a reference, see
// http://www.unicode.org/Public/UNIDATA/Blocks.txt

@@ -179,6 +179,6 @@

// failed to find the block
// failed to find the block
return -1;
// todo: differentiate when it's out of range or when it falls
// todo: differentiate when it's out of range or when it falls
// into an unassigned range?

@@ -194,4 +194,4 @@ }

* Executes the parsing operation
*
* Be sure to call the set*() functions to set options and the
*
* Be sure to call the set*() functions to set options and the
* prepare*() functions first to tell it what kind of data to compute

@@ -240,3 +240,3 @@ *

if (!this._trigram_pad_start) {
tmp = this._next_char(this._string, byte_counter, true);
tmp = this._next_char(this._string, byte_counter);
byte_counter = tmp[0];

@@ -246,3 +246,3 @@ a = tmp[1];

if (a != ' ') {
tmp = this._next_char(this._string, byte_counter, true);
tmp = this._next_char(this._string, byte_counter);
byte_counter = tmp[0];

@@ -263,3 +263,3 @@ b = tmp[1];

tmp = this._next_char(this._string, byte_counter, true);
tmp = this._next_char(this._string, byte_counter);
byte_counter = tmp[0];

@@ -364,3 +364,3 @@ _char = tmp[1];

* Will get the next character starting from $counter, which will then be
* incremented. If a multi-byte char the bytes will be concatenated and
* incremented. If a multi-byte char the bytes will be concatenated and
* $counter will be incremented by the number of bytes in the char.

@@ -374,90 +374,5 @@ *

*/
, _next_char: function(str, counter, _special_convert){
var special_convert = _special_convert || false;
//console.log('counter=', counter);
var _char = str.charAt(counter++);
//console.log('char=', _char)
//console.log('counter=', counter);
var ord = php.ord(_char);
//console.log('ord=', ord);
// for a description of the utf8 system see
// http://www.phpclasses.org/browse/file/5131.html
// normal ascii one byte char
if (ord <= 127) {
// special conversions needed for this package
// (that only apply to regular ascii characters)
// lower case, and convert all non-alphanumeric characters
// other than "'" to space
if (special_convert && _char != ' ' && _char != "'") {
if (ord >= 65 && ord <= 90) { // A-Z
_char = php.chr(ord + 32); // lower case
} else if (ord < 97 || ord > 122) { // NOT a-z
_char = ' '; // convert to space
}
}
return [counter, _char];
// multi-byte chars
} else if (ord >> 5 == 6) { // two-byte char
nextchar = str[counter++]; // get next byte
// lower-casing of non-ascii characters is still incomplete
if (special_convert) {
// lower case latin accented characters
if (ord == 195) {
nextord = php.ord(nextchar);
nextord_adj = nextord + 64;
// for a reference, see
// http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html
// &Agrave; - &THORN; but not &times;
if ( nextord_adj >= 192
&& nextord_adj <= 222
&& nextord_adj != 215) {
nextchar = php.chr(nextord + 32);
}
// lower case cyrillic alphabet
} else if (ord == 208) {
nextord = php.ord(nextchar);
// if A - Pe
if (nextord >= 144 && nextord <= 159) {
// lower case
nextchar = php.chr(nextord + 32);
// if Er - Ya
} else if (nextord >= 160 && nextord <= 175) {
// lower case
_char = php.chr(209); // == $ord++
nextchar = php.chr(nextord - 32);
}
}
}
// tag on next byte
return [counter, _char + nextchar];
} else if (ord >> 4 == 14) { // three-byte char
// tag on next 2 bytes
var ret = _char + str[counter++] + str[counter++];
return [counter, ret];
} else if (ord >> 3 == 30) { // four-byte _char
// tag on next 3 bytes
var ret = _char + str[counter++] + str[counter++] + str[counter++];
return [counter, ret];
} else {
// error?
// FIXME
return [counter, ' '];
}
, _next_char: function(str, counter){
// Returns a two-element array: [index of the following position, lower-cased character].
// NOTE(review): unlike the byte-oriented description in the doc comment above,
// this version reads exactly one position via charAt() and does no multi-byte
// concatenation; charAt() yields '' once counter is past the end of str.
var _char = str.charAt(counter++);
// counter has already been advanced by the post-increment above
return [counter, _char.toLowerCase()];
}

@@ -494,3 +409,3 @@

// 3 byte unicode
// 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
// 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
var z = (php.ord(_char[0]) &0x0000000F) << 12;

@@ -513,3 +428,3 @@ var x1 = (php.ord(_char[1]) & 0x0000003F) << 6;

default:
default:
// error: malformatted char?

@@ -522,3 +437,3 @@ return -1;

* Sorts an array by value breaking ties alphabetically
*
*
* @access private

@@ -563,3 +478,3 @@ * @param array &$arr the array to sort

* @access protected
* @param array $arr array of trigrams
* @param array $arr array of trigrams
* @return array ranks of trigrams

@@ -594,3 +509,3 @@ */

*
* Callback function for usort().
* Callback function for usort().
*

@@ -597,0 +512,0 @@ * @access private

@@ -5,5 +5,4 @@ {

"keywords": ["n-gram", "language", "language detection"],
"version": "0.0.1",
"version": "1.0.0",
"homepage": "http://blog.fgribreau.com/2011/07/week-end-project-nodejs-language.html",
"devDependencies": { "nodeunit": ">= 0.5.1" },
"author" : "Francois-Guillaume Ribreau <npm@fgribreau.com> (http://fgribreau.com)",

@@ -18,3 +17,12 @@ "main" : "index",

],
"engines": { "node": "*" }
}
"scripts": {
"test": "node_modules/nodeunit/bin/nodeunit test/*.test.js"
},
"contributors": [
"chaoser <ruslan.zavackiy@gmail.com> (https://github.com/Chaoser)"
],
"devDependencies":{
"nodeunit":">= 0.5.1"
},
"engines": { "node": ">= 0.4.8" }
}

@@ -0,1 +1,2 @@

# Node Language Detect [![Build Status](https://secure.travis-ci.org/FGRibreau/node-language-detect.png)](http://travis-ci.org/FGRibreau/node-language-detect) #
`LanguageDetect` is a port of the [PEAR::Text_LanguageDetect](http://pear.php.net/package/Text_LanguageDetect) for [node.js](http://nodejs.org).

@@ -22,3 +23,2 @@

/*
Will print:
[ [ 'english', 0.5969230769230769 ],

@@ -38,3 +38,3 @@ [ 'hungarian', 0.407948717948718 ],

// Only get first 2 results
// Only get the first 2 results
console.log(lngDetector.detect('This is a test.', 2));

@@ -41,0 +41,0 @@

@@ -22,23 +22,23 @@ (function() {

t.equal(a._distance(a._lang_db['arabic'], trigram_freqs), 42900);
t.equal(a._distance(a._lang_db['azeri'], trigram_freqs), 41727);
t.equal(a._distance(a._lang_db['azeri'], trigram_freqs), 41739);
t.equal(a._distance(a._lang_db['bengali'], trigram_freqs), 42900);
t.equal(a._distance(a._lang_db['bulgarian'], trigram_freqs), 42900);
t.equal(a._distance(a._lang_db['cebuano'], trigram_freqs), 40041);
t.equal(a._distance(a._lang_db['croatian'], trigram_freqs), 37103);
t.equal(a._distance(a._lang_db['czech'], trigram_freqs), 39100);
t.equal(a._distance(a._lang_db['danish'], trigram_freqs), 35334);
t.equal(a._distance(a._lang_db['dutch'], trigram_freqs), 37691);
t.equal(a._distance(a._lang_db['english'], trigram_freqs), 27435);
t.equal(a._distance(a._lang_db['estonian'], trigram_freqs), 37512);
t.equal(a._distance(a._lang_db['cebuano'], trigram_freqs), 40051);
t.equal(a._distance(a._lang_db['croatian'], trigram_freqs), 37390);
t.equal(a._distance(a._lang_db['czech'], trigram_freqs), 39284);
t.equal(a._distance(a._lang_db['danish'], trigram_freqs), 35149);
t.equal(a._distance(a._lang_db['dutch'], trigram_freqs), 37838);
t.equal(a._distance(a._lang_db['english'], trigram_freqs), 27607);
t.equal(a._distance(a._lang_db['estonian'], trigram_freqs), 37536);
t.equal(a._distance(a._lang_db['farsi'], trigram_freqs), 42900);
t.equal(a._distance(a._lang_db['finnish'], trigram_freqs), 38619);
t.equal(a._distance(a._lang_db['french'], trigram_freqs), 34141);
t.equal(a._distance(a._lang_db['german'], trigram_freqs), 37005);
t.equal(a._distance(a._lang_db['hausa'], trigram_freqs), 40622);
t.equal(a._distance(a._lang_db['hawaiian'], trigram_freqs), 40878);
t.equal(a._distance(a._lang_db['finnish'], trigram_freqs), 38637);
t.equal(a._distance(a._lang_db['french'], trigram_freqs), 34185);
t.equal(a._distance(a._lang_db['german'], trigram_freqs), 37030);
t.equal(a._distance(a._lang_db['hausa'], trigram_freqs), 40827);
t.equal(a._distance(a._lang_db['hawaiian'], trigram_freqs), 40890);
t.equal(a._distance(a._lang_db['hindi'], trigram_freqs), 42900);
t.equal(a._distance(a._lang_db['hungarian'], trigram_freqs), 37880);
t.equal(a._distance(a._lang_db['icelandic'], trigram_freqs), 39340);
t.equal(a._distance(a._lang_db['indonesian'], trigram_freqs), 40286);
t.equal(a._distance(a._lang_db['italian'], trigram_freqs), 34882);
t.equal(a._distance(a._lang_db['hungarian'], trigram_freqs), 37891);
t.equal(a._distance(a._lang_db['icelandic'], trigram_freqs), 39345);
t.equal(a._distance(a._lang_db['indonesian'], trigram_freqs), 40298);
t.equal(a._distance(a._lang_db['italian'], trigram_freqs), 34749);
t.equal(a._distance(a._lang_db['kazakh'], trigram_freqs), 42900);

@@ -75,5 +75,5 @@ return t.done();

r = l.detect(tweet);
t.deepEqual(r[0], ['english', 0.3604895104895105]);
t.deepEqual(r[0], ['english', 0.35648018648018653]);
return t.done();
};
}).call(this);

@@ -693,146 +693,145 @@ (function() {

l.analyze();
t.deepEqual(l.getTrigramRanks(), {
"ion": 0,
"on ": 1,
" so": 2,
"ess": 3,
"hou": 4,
"n s": 5,
"oul": 6,
"re ": 7,
"tio": 8,
"ust": 9,
" a ": 10,
" al": 11,
" ar": 12,
" b ": 13,
" bo": 14,
" do": 15,
" ex": 16,
" fo": 17,
" fr": 18,
" go": 19,
" ho": 20,
" if": 21,
" in": 22,
" ju": 23,
" li": 24,
" me": 25,
" pv": 26,
" se": 27,
" sh": 28,
" st": 29,
" sw": 30,
" th": 31,
" to": 32,
" u ": 33,
" wh": 34,
"a s": 35,
"alw": 36,
"are": 37,
"at ": 38,
"ati": 39,
"atu": 40,
"ays": 41,
"b l": 42,
"bot": 43,
"d a": 44,
"d s": 45,
"don": 46,
"e g": 47,
"e i": 48,
"e s": 49,
"elf": 50,
"ent": 51,
"ers": 52,
"exp": 53,
"f e": 54,
"f t": 55,
"fol": 56,
"fro": 57,
"goo": 58,
"hat": 59,
"her": 60,
"hom": 61,
"hos": 62,
"if ": 63,
"imi": 64,
"int": 65,
"itl": 66,
"jus": 67,
"l w": 68,
"ld ": 69,
"les": 70,
"lf ": 71,
"lim": 72,
"llo": 73,
"low": 74,
"lwa": 75,
"m s": 76,
"me ": 77,
"mit": 78,
"mor": 79,
"n t": 80,
"nat": 81,
"ns ": 82,
"nt ": 83,
"nte": 84,
"nti": 85,
"od ": 86,
"oll": 87,
"om ": 88,
"omo": 89,
"ons": 90,
"ont": 91,
"ood": 92,
"oph": 93,
"ore": 94,
"ose": 95,
"oth": 96,
"ous": 97,
"ow ": 98,
"pho": 99,
"pre": 100,
"pvn": 101,
"res": 102,
"rom": 103,
"rs ": 104,
"s a": 105,
"s b": 106,
"s i": 107,
"s j": 108,
"s u": 109,
"se ": 110,
"sel": 111,
"sho": 112,
"sio": 113,
"sop": 114,
"sou": 115,
"ss ": 116,
"ssi": 117,
"st ": 118,
"sta": 119,
"sto": 120,
"sw ": 121,
"t a": 122,
"t b": 123,
"t f": 124,
"tat": 125,
"ten": 126,
"tha": 127,
"the": 128,
"tle": 129,
"to ": 130,
"ton": 131,
"tus": 132,
"ul ": 133,
"uld": 134,
"us ": 135,
"vna": 136,
"w h": 137,
"w m": 138,
"way": 139,
"who": 140,
"xpr": 141,
"ys ": 142
t.deepEqual(l.getTrigramRanks(), { ion: 0,
'on ': 1,
' so': 2,
ess: 3,
hou: 4,
'n s': 5,
oul: 6,
're ': 7,
tio: 8,
ust: 9,
' a ': 10,
' al': 11,
' ar': 12,
' b ': 13,
' bo': 14,
' fo': 15,
' fr': 16,
' go': 17,
' ho': 18,
' if': 19,
' in': 20,
' ju': 21,
' li': 22,
' me': 23,
' pv': 24,
' se': 25,
' sh': 26,
' st': 27,
' sw': 28,
' th': 29,
' to': 30,
' ud': 31,
' wh': 32,
'a s': 33,
alw: 34,
are: 35,
'at ': 36,
ati: 37,
atu: 38,
ays: 39,
'b l': 40,
bot: 41,
'd a': 42,
'd s': 43,
don: 44,
'e g': 45,
'e i': 46,
'e s': 47,
elf: 48,
ent: 49,
ers: 50,
exp: 51,
'f t': 52,
fex: 53,
fol: 54,
fro: 55,
goo: 56,
hat: 57,
her: 58,
hom: 59,
hos: 60,
'if ': 61,
imi: 62,
int: 63,
itl: 64,
jus: 65,
'l w': 66,
'ld ': 67,
les: 68,
lfe: 69,
lim: 70,
llo: 71,
low: 72,
lwa: 73,
'm s': 74,
'me ': 75,
mit: 76,
mor: 77,
'n t': 78,
nat: 79,
'ns ': 80,
'nt ': 81,
nte: 82,
nti: 83,
'o p': 84,
'od ': 85,
oll: 86,
'om ': 87,
omo: 88,
ons: 89,
ont: 90,
ood: 91,
oph: 92,
ore: 93,
ose: 94,
oth: 95,
ous: 96,
'ow ': 97,
pho: 98,
pre: 99,
pvn: 100,
res: 101,
rom: 102,
'rs ': 103,
's a': 104,
's b': 105,
's i': 106,
's j': 107,
's u': 108,
'se ': 109,
sel: 110,
sho: 111,
sio: 112,
sop: 113,
sou: 114,
'ss ': 115,
ssi: 116,
'st ': 117,
sta: 118,
sto: 119,
'sw ': 120,
't a': 121,
't b': 122,
't f': 123,
tat: 124,
ten: 125,
tha: 126,
the: 127,
tle: 128,
'to ': 129,
ton: 130,
tus: 131,
udo: 132,
'ul ': 133,
uld: 134,
'us ': 135,
vna: 136,
'w h': 137,
'w m': 138,
way: 139,
who: 140,
xpr: 141,
'ys ': 142
});

@@ -839,0 +838,0 @@ return t.done();

Socket | Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc