languagedetect - npm Package Compare versions

.npmignore

.travis.yml

data/lang.json

data/unicode_blocks.json

2

lib/LanguageDetect.js

		@@ -32,3 +32,3 @@ /**
		var php = require('./phpjs')
		, db_lang = require('../data/lang')
		, db_lang = require('../data/lang.json')
		, Parser = require('./Parser');
		@@ -35,0 +35,0 @@

127

lib/Parser.js

		@@ -1,2 +0,2 @@
		var db_unicode_blocks = require('../data/unicode_blocks');
		var db_unicode_blocks = require('../data/unicode_blocks.json');
		var php = require('../lib/phpjs');
		@@ -37,6 +37,6 @@
		this._unicode_blocks = {};


		/**
		* Whether the parser should compile the unicode ranges
		*
		*
		* @access private
		@@ -87,3 +87,3 @@ * @var bool
		*/
		this._string = string;
		this._string = string ? string.replace(/[`~!@#$%^&*()_\|+\-=?;:'",.<>\{\}\[\]\\\/]/g, '') : '';
		};
		@@ -141,3 +141,3 @@

		// for a reference, see
		// for a reference, see
		// http://www.unicode.org/Public/UNIDATA/Blocks.txt
		@@ -179,6 +179,6 @@

		// failed to find the block
		// failed to find the block
		return -1;

		// todo: differentiate when it's out of range or when it falls
		// todo: differentiate when it's out of range or when it falls
		// into an unassigned range?
		@@ -194,4 +194,4 @@ }
		* Executes the parsing operation
		*
		* Be sure to call the set*() functions to set options and the
		*
		* Be sure to call the set*() functions to set options and the
		* prepare*() functions first to tell it what kind of data to compute
		@@ -240,3 +240,3 @@ *
		if (!this._trigram_pad_start) {
		tmp = this._next_char(this._string, byte_counter, true);
		tmp = this._next_char(this._string, byte_counter);
		byte_counter = tmp[0];
		@@ -246,3 +246,3 @@ a = tmp[1];
		if (a != ' ') {
		tmp = this._next_char(this._string, byte_counter, true);
		tmp = this._next_char(this._string, byte_counter);
		byte_counter = tmp[0];
		@@ -263,3 +263,3 @@ b = tmp[1];

		tmp = this._next_char(this._string, byte_counter, true);
		tmp = this._next_char(this._string, byte_counter);
		byte_counter = tmp[0];
		@@ -364,3 +364,3 @@ _char = tmp[1];
		* Will get the next character starting from $counter, which will then be
		* incremented. If a multi-byte char the bytes will be concatenated and
		* incremented. If a multi-byte char the bytes will be concatenated and
		* $counter will be incremented by the number of bytes in the char.
		@@ -374,90 +374,5 @@ *
		*/
		, _next_char: function(str, counter, _special_convert){
		var special_convert = _special_convert \|\| false;

		//console.log('counter=', counter);
		var _char = str.charAt(counter++);
		//console.log('char=', _char)
		//console.log('counter=', counter);
		var ord = php.ord(_char);
		//console.log('ord=', ord);

		// for a description of the utf8 system see
		// http://www.phpclasses.org/browse/file/5131.html

		// normal ascii one byte char
		if (ord <= 127) {
		// special conversions needed for this package
		// (that only apply to regular ascii characters)
		// lower case, and convert all non-alphanumeric characters
		// other than "'" to space
		if (special_convert && _char != ' ' && _char != "'") {
		if (ord >= 65 && ord <= 90) { // A-Z
		_char = php.chr(ord + 32); // lower case
		} else if (ord < 97 \|\| ord > 122) { // NOT a-z
		_char = ' '; // convert to space
		}
		}

		return [counter, _char];

		// multi-byte chars
		} else if (ord >> 5 == 6) { // two-byte char
		nextchar = str[counter++]; // get next byte

		// lower-casing of non-ascii characters is still incomplete

		if (special_convert) {
		// lower case latin accented characters
		if (ord == 195) {
		nextord = php.ord(nextchar);
		nextord_adj = nextord + 64;
		// for a reference, see
		// http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html

		// À - Þ but not ×
		if ( nextord_adj >= 192
		&& nextord_adj <= 222
		&& nextord_adj != 215) {

		nextchar = php.chr(nextord + 32);
		}

		// lower case cyrillic alphabet
		} else if (ord == 208) {
		nextord = php.ord(nextchar);
		// if A - Pe
		if (nextord >= 144 && nextord <= 159) {
		// lower case
		nextchar = php.chr(nextord + 32);

		// if Er - Ya
		} else if (nextord >= 160 && nextord <= 175) {
		// lower case
		_char = php.chr(209); // == $ord++
		nextchar = php.chr(nextord - 32);
		}
		}
		}

		// tag on next byte
		return [counter, _char + nextchar];

		} else if (ord >> 4 == 14) { // three-byte char

		// tag on next 2 bytes
		var ret = _char + str[counter++] + str[counter++];
		return [counter, ret];

		} else if (ord >> 3 == 30) { // four-byte _char

		// tag on next 3 bytes
		var ret = _char + str[counter++] + str[counter++] + str[counter++];
		return [counter, ret];

		} else {
		// error?
		// FIXME
		return [counter, ' '];
		}
		, _next_char: function(str, counter){
		var _char = str.charAt(counter++);
		return [counter, _char.toLowerCase()];
		}
		@@ -494,3 +409,3 @@
		// 3 byte unicode
		// 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
		// 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
		var z = (php.ord(_char[0]) &0x0000000F) << 12;
		@@ -513,3 +428,3 @@ var x1 = (php.ord(_char[1]) & 0x0000003F) << 6;

		default:
		default:
		// error: malformed char?
		@@ -522,3 +437,3 @@ return -1;
		* Sorts an array by value breaking ties alphabetically
		*
		*
		* @access private
		@@ -563,3 +478,3 @@ * @param array &$arr the array to sort
		* @access protected
		* @param array $arr array of trigram
		* @param array $arr array of trigram
		* @return array ranks of trigrams
		@@ -594,3 +509,3 @@ */
		*
		* Callback function for usort().
		* Callback function for usort().
		*
		@@ -597,0 +512,0 @@ * @access private

16

package.json

		@@ -5,5 +5,4 @@ {
		"keywords": ["n-gram", "language", "language detection"],
		"version": "0.0.1",
		"version": "1.0.0",
		"homepage": "http://blog.fgribreau.com/2011/07/week-end-project-nodejs-language.html",
		"devDependencies": { "nodeunit": ">= 0.5.1" },
		"author" : "Francois-Guillaume Ribreau <npm@fgribreau.com> (http://fgribreau.com)",
		@@ -18,3 +17,12 @@ "main" : "index",
		],
		"engines": { "node": "*" }
		}
		"scripts": {
		"test": "node_modules/nodeunit/bin/nodeunit test/*.test.js"
		},
		"contributors": [
		"chaoser <ruslan.zavackiy@gmail.com> (https://github.com/Chaoser)"
		],
		"devDependencies":{
		"nodeunit":">= 0.5.1"
		},
		"engines": { "node": ">= 0.4.8" }
		}

4

README.md

		@@ -0,1 +1,2 @@
		# Node Language Detect [![Build Status](https://secure.travis-ci.org/FGRibreau/node-language-detect.png)](http://travis-ci.org/FGRibreau/node-language-detect) #
		`LanguageDetect` is a port of [PEAR::Text_LanguageDetect](http://pear.php.net/package/Text_LanguageDetect) for [node.js](http://nodejs.org).
		@@ -22,3 +23,2 @@
		/*
		Will print:
		[ [ 'english', 0.5969230769230769 ],
		@@ -38,3 +38,3 @@ [ 'hungarian', 0.407948717948718 ],

		// Only get first 2 results
		// Only get the first 2 results
		console.log(lngDetector.detect('This is a test.', 2));
		@@ -41,0 +41,0 @@

36

test/LanguageDetect.test.js

		@@ -22,23 +22,23 @@ (function() {
		t.equal(a._distance(a._lang_db['arabic'], trigram_freqs), 42900);
		t.equal(a._distance(a._lang_db['azeri'], trigram_freqs), 41727);
		t.equal(a._distance(a._lang_db['azeri'], trigram_freqs), 41739);
		t.equal(a._distance(a._lang_db['bengali'], trigram_freqs), 42900);
		t.equal(a._distance(a._lang_db['bulgarian'], trigram_freqs), 42900);
		t.equal(a._distance(a._lang_db['cebuano'], trigram_freqs), 40041);
		t.equal(a._distance(a._lang_db['croatian'], trigram_freqs), 37103);
		t.equal(a._distance(a._lang_db['czech'], trigram_freqs), 39100);
		t.equal(a._distance(a._lang_db['danish'], trigram_freqs), 35334);
		t.equal(a._distance(a._lang_db['dutch'], trigram_freqs), 37691);
		t.equal(a._distance(a._lang_db['english'], trigram_freqs), 27435);
		t.equal(a._distance(a._lang_db['estonian'], trigram_freqs), 37512);
		t.equal(a._distance(a._lang_db['cebuano'], trigram_freqs), 40051);
		t.equal(a._distance(a._lang_db['croatian'], trigram_freqs), 37390);
		t.equal(a._distance(a._lang_db['czech'], trigram_freqs), 39284);
		t.equal(a._distance(a._lang_db['danish'], trigram_freqs), 35149);
		t.equal(a._distance(a._lang_db['dutch'], trigram_freqs), 37838);
		t.equal(a._distance(a._lang_db['english'], trigram_freqs), 27607);
		t.equal(a._distance(a._lang_db['estonian'], trigram_freqs), 37536);
		t.equal(a._distance(a._lang_db['farsi'], trigram_freqs), 42900);
		t.equal(a._distance(a._lang_db['finnish'], trigram_freqs), 38619);
		t.equal(a._distance(a._lang_db['french'], trigram_freqs), 34141);
		t.equal(a._distance(a._lang_db['german'], trigram_freqs), 37005);
		t.equal(a._distance(a._lang_db['hausa'], trigram_freqs), 40622);
		t.equal(a._distance(a._lang_db['hawaiian'], trigram_freqs), 40878);
		t.equal(a._distance(a._lang_db['finnish'], trigram_freqs), 38637);
		t.equal(a._distance(a._lang_db['french'], trigram_freqs), 34185);
		t.equal(a._distance(a._lang_db['german'], trigram_freqs), 37030);
		t.equal(a._distance(a._lang_db['hausa'], trigram_freqs), 40827);
		t.equal(a._distance(a._lang_db['hawaiian'], trigram_freqs), 40890);
		t.equal(a._distance(a._lang_db['hindi'], trigram_freqs), 42900);
		t.equal(a._distance(a._lang_db['hungarian'], trigram_freqs), 37880);
		t.equal(a._distance(a._lang_db['icelandic'], trigram_freqs), 39340);
		t.equal(a._distance(a._lang_db['indonesian'], trigram_freqs), 40286);
		t.equal(a._distance(a._lang_db['italian'], trigram_freqs), 34882);
		t.equal(a._distance(a._lang_db['hungarian'], trigram_freqs), 37891);
		t.equal(a._distance(a._lang_db['icelandic'], trigram_freqs), 39345);
		t.equal(a._distance(a._lang_db['indonesian'], trigram_freqs), 40298);
		t.equal(a._distance(a._lang_db['italian'], trigram_freqs), 34749);
		t.equal(a._distance(a._lang_db['kazakh'], trigram_freqs), 42900);
		@@ -75,5 +75,5 @@ return t.done();
		r = l.detect(tweet);
		t.deepEqual(r[0], ['english', 0.3604895104895105]);
		t.deepEqual(r[0], ['english', 0.35648018648018653]);
		return t.done();
		};
		}).call(this);

287

test/Parser.test.js

		@@ -693,146 +693,145 @@ (function() {
		l.analyze();
		t.deepEqual(l.getTrigramRanks(), {
		"ion": 0,
		"on ": 1,
		" so": 2,
		"ess": 3,
		"hou": 4,
		"n s": 5,
		"oul": 6,
		"re ": 7,
		"tio": 8,
		"ust": 9,
		" a ": 10,
		" al": 11,
		" ar": 12,
		" b ": 13,
		" bo": 14,
		" do": 15,
		" ex": 16,
		" fo": 17,
		" fr": 18,
		" go": 19,
		" ho": 20,
		" if": 21,
		" in": 22,
		" ju": 23,
		" li": 24,
		" me": 25,
		" pv": 26,
		" se": 27,
		" sh": 28,
		" st": 29,
		" sw": 30,
		" th": 31,
		" to": 32,
		" u ": 33,
		" wh": 34,
		"a s": 35,
		"alw": 36,
		"are": 37,
		"at ": 38,
		"ati": 39,
		"atu": 40,
		"ays": 41,
		"b l": 42,
		"bot": 43,
		"d a": 44,
		"d s": 45,
		"don": 46,
		"e g": 47,
		"e i": 48,
		"e s": 49,
		"elf": 50,
		"ent": 51,
		"ers": 52,
		"exp": 53,
		"f e": 54,
		"f t": 55,
		"fol": 56,
		"fro": 57,
		"goo": 58,
		"hat": 59,
		"her": 60,
		"hom": 61,
		"hos": 62,
		"if ": 63,
		"imi": 64,
		"int": 65,
		"itl": 66,
		"jus": 67,
		"l w": 68,
		"ld ": 69,
		"les": 70,
		"lf ": 71,
		"lim": 72,
		"llo": 73,
		"low": 74,
		"lwa": 75,
		"m s": 76,
		"me ": 77,
		"mit": 78,
		"mor": 79,
		"n t": 80,
		"nat": 81,
		"ns ": 82,
		"nt ": 83,
		"nte": 84,
		"nti": 85,
		"od ": 86,
		"oll": 87,
		"om ": 88,
		"omo": 89,
		"ons": 90,
		"ont": 91,
		"ood": 92,
		"oph": 93,
		"ore": 94,
		"ose": 95,
		"oth": 96,
		"ous": 97,
		"ow ": 98,
		"pho": 99,
		"pre": 100,
		"pvn": 101,
		"res": 102,
		"rom": 103,
		"rs ": 104,
		"s a": 105,
		"s b": 106,
		"s i": 107,
		"s j": 108,
		"s u": 109,
		"se ": 110,
		"sel": 111,
		"sho": 112,
		"sio": 113,
		"sop": 114,
		"sou": 115,
		"ss ": 116,
		"ssi": 117,
		"st ": 118,
		"sta": 119,
		"sto": 120,
		"sw ": 121,
		"t a": 122,
		"t b": 123,
		"t f": 124,
		"tat": 125,
		"ten": 126,
		"tha": 127,
		"the": 128,
		"tle": 129,
		"to ": 130,
		"ton": 131,
		"tus": 132,
		"ul ": 133,
		"uld": 134,
		"us ": 135,
		"vna": 136,
		"w h": 137,
		"w m": 138,
		"way": 139,
		"who": 140,
		"xpr": 141,
		"ys ": 142
		t.deepEqual(l.getTrigramRanks(), { ion: 0,
		'on ': 1,
		' so': 2,
		ess: 3,
		hou: 4,
		'n s': 5,
		oul: 6,
		're ': 7,
		tio: 8,
		ust: 9,
		' a ': 10,
		' al': 11,
		' ar': 12,
		' b ': 13,
		' bo': 14,
		' fo': 15,
		' fr': 16,
		' go': 17,
		' ho': 18,
		' if': 19,
		' in': 20,
		' ju': 21,
		' li': 22,
		' me': 23,
		' pv': 24,
		' se': 25,
		' sh': 26,
		' st': 27,
		' sw': 28,
		' th': 29,
		' to': 30,
		' ud': 31,
		' wh': 32,
		'a s': 33,
		alw: 34,
		are: 35,
		'at ': 36,
		ati: 37,
		atu: 38,
		ays: 39,
		'b l': 40,
		bot: 41,
		'd a': 42,
		'd s': 43,
		don: 44,
		'e g': 45,
		'e i': 46,
		'e s': 47,
		elf: 48,
		ent: 49,
		ers: 50,
		exp: 51,
		'f t': 52,
		fex: 53,
		fol: 54,
		fro: 55,
		goo: 56,
		hat: 57,
		her: 58,
		hom: 59,
		hos: 60,
		'if ': 61,
		imi: 62,
		int: 63,
		itl: 64,
		jus: 65,
		'l w': 66,
		'ld ': 67,
		les: 68,
		lfe: 69,
		lim: 70,
		llo: 71,
		low: 72,
		lwa: 73,
		'm s': 74,
		'me ': 75,
		mit: 76,
		mor: 77,
		'n t': 78,
		nat: 79,
		'ns ': 80,
		'nt ': 81,
		nte: 82,
		nti: 83,
		'o p': 84,
		'od ': 85,
		oll: 86,
		'om ': 87,
		omo: 88,
		ons: 89,
		ont: 90,
		ood: 91,
		oph: 92,
		ore: 93,
		ose: 94,
		oth: 95,
		ous: 96,
		'ow ': 97,
		pho: 98,
		pre: 99,
		pvn: 100,
		res: 101,
		rom: 102,
		'rs ': 103,
		's a': 104,
		's b': 105,
		's i': 106,
		's j': 107,
		's u': 108,
		'se ': 109,
		sel: 110,
		sho: 111,
		sio: 112,
		sop: 113,
		sou: 114,
		'ss ': 115,
		ssi: 116,
		'st ': 117,
		sta: 118,
		sto: 119,
		'sw ': 120,
		't a': 121,
		't b': 122,
		't f': 123,
		tat: 124,
		ten: 125,
		tha: 126,
		the: 127,
		tle: 128,
		'to ': 129,
		ton: 130,
		tus: 131,
		udo: 132,
		'ul ': 133,
		uld: 134,
		'us ': 135,
		vna: 136,
		'w h': 137,
		'w m': 138,
		way: 139,
		who: 140,
		xpr: 141,
		'ys ': 142
		});
		@@ -839,0 +838,0 @@ return t.done();

data/lang.dat

data/lang.js

data/unicode_blocks.dat

data/unicode_blocks.js

languagedetect - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics