Socket
Socket
Sign in · Demo · Install

chinese-seg

Package Overview
Dependencies
6
Maintainers
1
Versions
7
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 0.0.0 to 0.0.1

lib/middleware/chn.js

3

dict/chn-dict2.txt

@@ -311,1 +311,4 @@ {"word":"工信处","props":{"机构团体":1},"freq":100}

{"word":"艘","props":{"数词":1},"freq":101}
{"word":"微博","props":{"名词":1},"freq":10000}
{"word":"微信","props":{"名词":1},"freq":10000}
{"word":"工作坊","props":{"名词":1},"freq":2000}

28

lib/middleware/punctuation.js
var __indexOf = [].indexOf || function(item) { for (var i = 0, l = this.length; i < l; i++) { if (i in this && this[i] === item) return i; } return -1; };
module.exports = function(options) {
var extend, fn, hanzenkaku, isToken, name, noop, path, stopwords, tokenizeSync, unique, _ref, _stopwords;
var extend, fn, hanzenkaku, isToken, latinAlphaPattern, name, noop, path, stopwords, tokenizeSync, unique, _ref, _stopwords;
hanzenkaku = require('hanzenkaku').HanZenKaku;

@@ -12,3 +12,8 @@ _ref = require('../utils'), noop = _ref.noop, path = _ref.path, extend = _ref.extend, unique = _ref.unique, tokenizeSync = _ref.tokenizeSync;

options.skipProps = unique([options.propName].concat(options.skipProps));
_stopwords = '\u3000 ,.;+-|/\\\'":?<>[]{}=!@#$%^&*()~`' + '。,、':∶;?‘’“”〝〞ˆˇ﹕︰﹔﹖﹑·¨….¸;!´?!~—ˉ|‖"〃`@﹫¡¿﹏﹋﹌︴々﹟#﹩$﹠&﹪%*﹡﹢﹦' + '﹤‐ ̄¯―﹨ˆ˜﹍﹎+=<­__-\ˇ~﹉﹊()〈〉‹›﹛﹜『』〖〗[]《》〔〕{}「」【】︵︷︿︹︽_﹁﹃︻︶︸' + '﹀︺︾ˉ﹂﹄︼+-×÷﹢﹣±/=≈≡≠∧∨∑∏∪∩∈⊙⌒⊥∥∠∽≌<>≤≥≮≯∧∨√﹙﹚[]﹛﹜∫∮∝∞⊙∏' + '┌┬┐┏┳┓╒╤╕─│├┼┤┣╋┫╞╪╡━┃└┴┘┗┻┛╘╧╛┄┆┅┇╭─╮┏━┓╔╦╗┈┊│╳│┃┃╠╬╣┉┋╰─╯┗━┛' + '╚╩╝╲╱┞┟┠┡┢┦┧┨┩┪╉╊┭┮┯┰┱┲┵┶┷┸╇╈┹┺┽┾┿╀╁╂╃╄╅╆' + '○◇□△▽☆●◆■▲▼★♠♥♦♣☼☺◘♀√☻◙♂×▁▂▃▄▅▆▇█⊙◎۞卍卐╱╲▁▏↖↗↑←↔◤◥╲╱▔▕↙↘↓→↕◣◢∷▒░℡™';
// Punctuation characters collected into one string: the Latin marks come
// first, followed by the CJK marks. The result is the plain concatenation
// of the two literals below.
_stopwords = [
  '.?!,:;(){}[]"\'',
  '。。?!,、;:()[]〔〕【】﹃﹄﹁﹂《》〈〉…“”‘’„‚'
].join('');
stopwords = (function() {

@@ -22,4 +27,5 @@ var result;

})();
// Matches one Latin letter: ASCII a-z / A-Z plus the Latin-1 Supplement
// letters U+00C0-U+00FF, skipping the multiplication sign (U+00D7) and the
// division sign (U+00F7). The previous class was written `\xD8-\xF6-\xFE`,
// which the regex engine reads as the range \xD8-\xF6 followed by a literal
// "-" and a lone \xFE, so "-" counted as a letter and \xF8-\xFD did not.
latinAlphaPattern = /[a-zA-Z\xC0-\xD6\xD8-\xF6\xF8-\xFF]/;
isToken = function(text, index) {
var halfChar, nextChar, oriChar, prevChar, result, _i, _j, _k, _ref1, _ref2, _ref3, _ref4, _ref5, _ref6, _ref7, _ref8, _ref9, _results, _results1, _results2;
var halfChar, nextChar, oriChar, prevChar, result, _i, _ref1, _ref2, _ref3, _results;
oriChar = text.charAt(index);

@@ -32,3 +38,3 @@ halfChar = oriChar.toHalfwidth().toHalfwidthSpace();

case "'":
result = (stopwords[prevChar] != null) || (stopwords[nextChar] != null);
result = !(latinAlphaPattern.test(prevChar) && latinAlphaPattern.test(nextChar));
break;

@@ -42,16 +48,2 @@ case ".":

break;
case "-":
result = !(_ref4 = nextChar.charCodeAt(), __indexOf.call((function() {
_results1 = [];
for (var _j = _ref5 = '0'.charCodeAt(), _ref6 = '9'.charCodeAt(); _ref5 <= _ref6 ? _j <= _ref6 : _j >= _ref6; _ref5 <= _ref6 ? _j++ : _j--){ _results1.push(_j); }
return _results1;
}).apply(this), _ref4) >= 0);
break;
case "+":
result = !(_ref7 = nextChar.charCodeAt(), __indexOf.call((function() {
_results2 = [];
for (var _k = _ref8 = '0'.charCodeAt(), _ref9 = '9'.charCodeAt(); _ref8 <= _ref9 ? _k <= _ref9 : _k >= _ref9; _ref8 <= _ref9 ? _k++ : _k--){ _results2.push(_k); }
return _results2;
}).apply(this), _ref7) >= 0);
break;
default:

@@ -58,0 +50,0 @@ result = stopwords[oriChar] != null;

var urlPattern,
__indexOf = [].indexOf || function(item) { for (var i = 0, l = this.length; i < l; i++) { if (i in this && this[i] === item) return i; } return -1; };
urlPattern = /(?:http|https|ftp|sftp|git|ssh):(?:\/\/)(?:[-;:&=\+\$,\w]+@)?(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:['\!\(\)\*\-\w]*\.)*(?:['\!\(\)\*\-\w]*\.)(?:com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2})(?:\:[0-9]{1,5})?)/ig;
// Builds the global, case-insensitive RegExp used to locate URL hosts.
//
// Two alternatives are produced:
//   1. scheme "://" [auth "@"] (IPv4 | localhost | host.tld) [":" port]
//   2. a scheme-less host: (IPv4 | localhost | at least two dot-terminated
//      labels followed by a TLD) [":" port]
//
// Only the scheme/host/port part is matched here; the pattern contains no
// path component.
urlPattern = (function() {
  var auth, domain, finalPart1, finalPart2, host1, host2, ip, ipFirst, ipRest, localhost, port, protocol;
  protocol = "(?:http|https|ftp|sftp|git|ssh):(?://)";
  auth = "(?:[-;:&=\\+\\$,\\w]+@)";
  // The leading octet may not be a bare 0 ...
  ipFirst = "(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])";
  // ... but the remaining octets may. Using one shared fragment without a
  // zero alternative made addresses such as 127.0.0.1 or 10.0.0.1 unmatchable.
  ipRest = "(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])";
  ip = ipFirst + '\\.' + ipRest + '\\.' + ipRest + '\\.' + ipRest;
  localhost = "localhost";
  // With an explicit scheme a single label before the TLD is enough.
  host1 = "(?:['\\!\\*\\-\\w]*\\.)+";
  // Without a scheme, require at least two labels before the TLD.
  host2 = "(?:['\\!\\*\\-\\w]*\\.){2,}";
  domain = "(?:com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2})";
  port = "(?:\\:[0-9]{1,5})";
  finalPart1 = protocol + auth + '?' + '(' + ip + '|' + localhost + '|' + host1 + domain + ')' + port + '?';
  finalPart2 = '(' + ip + '|' + localhost + '|' + host2 + domain + ')' + port + '?';
  return new RegExp("(" + finalPart1 + ")|(" + finalPart2 + ")", 'gi');
})();
module.exports = function(options) {
var extend, findOne, findURL, fn, name, noop, path, unique, _ref, _ref1;
_ref = require('../utils'), noop = _ref.noop, path = _ref.path, extend = _ref.extend, findOne = _ref.findOne, unique = _ref.unique;
var chnRange, extend, fn, matchSync, name, noop, path, puncs, tailAt, tailReg, _ref;
_ref = require('../utils'), noop = _ref.noop, path = _ref.path, extend = _ref.extend, matchSync = _ref.matchSync, chnRange = _ref.chnRange;
options = extend({
removeToken: false,
propName: 'url'
}, options);
options.skipProps = unique([options.propName].concat((_ref1 = options.skipProps) != null ? _ref1 : []));
findURL = function(word) {
var index, matched, result, start, tail, tailAt, text, urlHost, value, _i, _len, _ref2;
tailAt = function(text, start) {
var char, index, prevChar, puncs, reachTail, _i, _ref2;
puncs = ['.', ',', ';', '!', '<', '>', '(', ')'];
reachTail = function(ch) {
return /\s|[\u4e00-\u9FFF]/.test(ch);
};
for (index = _i = start, _ref2 = text.length; start <= _ref2 ? _i < _ref2 : _i > _ref2; index = start <= _ref2 ? ++_i : --_i) {
char = text.charAt(index);
if (reachTail(char)) {
break;
}
}
if (index > 0) {
prevChar = text.charAt(index - 1);
if (__indexOf.call(puncs, prevChar) >= 0) {
index -= 1;
}
}
return index;
tailReg = new RegExp("\\s|[" + chnRange + "]");
// One-character strings for every punctuation mark (Latin marks followed by
// CJK marks); the list is checked against the character just before a
// candidate tail position so that a trailing mark can be trimmed off.
puncs = ('.?!,:;(){}[]"\'' + '。。?!,、;:()[]〔〕【】﹃﹄﹁﹂《》〈〉…“”‘’„‚').split('');
tailAt = function(text, start) {
var char, index, prevChar, reachTail, _i, _ref1;
reachTail = function(ch) {
return tailReg.test(ch);
};
result = [];
matched = word.w.match(urlPattern);
if (matched != null) {
start = word.start;
text = word.w;
for (_i = 0, _len = matched.length; _i < _len; _i++) {
urlHost = matched[_i];
index = text.indexOf(urlHost);
tail = tailAt(text, index);
if (index > 0) {
value = {
w: text.slice(0, index),
start: start
};
if (word.props != null) {
value.props = word.props;
}
result.push(value);
}
value = {
w: text.slice(index, tail),
start: start + index,
props: (_ref2 = word.props) != null ? _ref2 : {}
};
value.props[options.propName] = 1;
result.push(value);
text = text.slice(tail);
start += tail;
for (index = _i = start, _ref1 = text.length; start <= _ref1 ? _i < _ref1 : _i > _ref1; index = start <= _ref1 ? ++_i : --_i) {
char = text.charAt(index);
if (reachTail(char)) {
break;
}
if (start < word.w.length) {
value = {
w: text,
start: start
};
if (word.props != null) {
value.props = word.props;
}
result.push(value);
}
if (index > 0) {
prevChar = text.charAt(index - 1);
if (__indexOf.call(puncs, prevChar) >= 0) {
index -= 1;
}
} else {
result.push(word);
}
return result;
return index;
};
name = path.basename(__filename, path.extname(__filename));
fn = function(words, next) {
var result, word, _i, _len;
if (next == null) {
next = noop;
}
result = [];
for (_i = 0, _len = words.length; _i < _len; _i++) {
word = words[_i];
if (findOne(word.props, options.skipProps)) {
result.push(word);
} else {
result = result.concat(findURL(word));
}
}
return next(null, result);
options.pattern = urlPattern;
options.tailAt = tailAt;
return next(null, matchSync(words, options));
};

@@ -97,0 +61,0 @@ return {

var __indexOf = [].indexOf || function(item) { for (var i = 0, l = this.length; i < l; i++) { if (i in this && this[i] === item) return i; } return -1; };
module.exports = function(options) {
var extWesternChars, extend, fn, hanzenkaku, isToken, name, noop, path, tokenizeSync, unique, _ref;
var extWesternChars, extend, findMatchSync, fn, fullToHalf, hanzenkaku, isToken, name, newProps, noop, numberIdentify, path, tokenizeSync, unique, versionIdentify, _ref;
hanzenkaku = require('hanzenkaku').HanZenKaku;
_ref = require('../utils'), noop = _ref.noop, path = _ref.path, extend = _ref.extend, unique = _ref.unique, tokenizeSync = _ref.tokenizeSync;
_ref = require('../utils'), noop = _ref.noop, path = _ref.path, extend = _ref.extend, unique = _ref.unique, tokenizeSync = _ref.tokenizeSync, findMatchSync = _ref.findMatchSync, newProps = _ref.newProps;
options = extend({
removeToken: false,
propName: 'western',
fullToHalf: false
propName: 'western'
}, options);

@@ -20,2 +19,57 @@ extWesternChars = [8361];

options.isToken = isToken;
// Rewrites, in place, the text of every word tagged with options.propName
// by calling toHalfwidth() and then toHalfwidthSpace() on it. Words without
// that tag are passed through untouched. Returns a new array holding the
// same word objects.
fullToHalf = function(words) {
  return words.map(function(row) {
    var props = row.props;
    if (props != null && props[options.propName] != null) {
      row.w = row.w.toHalfwidth().toHalfwidthSpace();
    }
    return row;
  });
};
// Splits every word tagged 'western' around version-like substrings (three
// or four dot-separated digit groups, e.g. "1.2.3") via findMatchSync with
// propName 'western.version'. Words without the 'western' tag are kept
// as they are.
versionIdentify = function(words) {
  var versionOpt = {
    pattern: /\d+(\.\d+){2,3}/gi,
    propName: 'western.version'
  };
  return words.reduce(function(collected, word) {
    var props = word.props;
    if (props == null || props['western'] == null) {
      collected.push(word);
      return collected;
    }
    return collected.concat(findMatchSync(word, versionOpt));
  }, []);
};
// Splits every word tagged 'western' around numeric substrings via
// findMatchSync with propName 'western.number'. The pattern accepts digits
// with optional ",ddd" groups and an optional fraction, or a bare fraction
// such as ".5". Pieces tagged 'western.number' whose text contains "." are
// additionally tagged 'western.number.fraction'.
//
// Words that are not 'western', or that already carry 'western.version' or
// 'western.number', are passed through unchanged.
numberIdentify = function(words) {
  var collected = [];
  var numberOpt = {
    pattern: /((\d+([\,]\d{3})*)(\.\d+)?)|(\.\d+)/gi,
    propName: 'western.number'
  };
  words.forEach(function(word) {
    var props = word.props;
    var pieces;
    if (props == null || props['western'] == null) {
      collected.push(word);
      return;
    }
    if (props['western.version'] != null || props['western.number'] != null) {
      collected.push(word);
      return;
    }
    pieces = findMatchSync(word, numberOpt);
    pieces.forEach(function(row) {
      var rowProps = row.props;
      if (rowProps != null && rowProps['western.number'] != null && row.w.indexOf('.') >= 0) {
        row.props['western.number.fraction'] = 1;
      }
    });
    collected = collected.concat(pieces);
  });
  return collected;
};
name = path.basename(__filename, path.extname(__filename));

@@ -27,14 +81,4 @@ fn = function(words, next) {

}
result = tokenizeSync(words, options);
if (options.fullToHalf) {
result = result.map(function(row) {
var _ref1;
if (((_ref1 = row.props) != null ? _ref1[options.propName] : void 0) != null) {
row.w = row.w.toHalfwidth().toHalfwidthSpace();
return row;
} else {
return row;
}
});
}
result = versionIdentify(fullToHalf(tokenizeSync(words, options)));
result = numberIdentify(result);
return next(null, result);

@@ -41,0 +85,0 @@ };

@@ -16,2 +16,3 @@ var Segment, basename, exports, fs, middleware, path, proto, utils;

utils.merge(this, proto);
this.utils = utils;
this.stack = [];

@@ -18,0 +19,0 @@ }

@@ -1,2 +0,2 @@

var eql, extend, findOne, merge, noop, path, splitWordSync, tokenizeSync, unique;
var chnRange, clone, eql, extend, findMatchSync, findOne, matchSync, merge, newProps, noop, path, splitWordSync, tokenizeSync, unique;

@@ -11,2 +11,4 @@ noop = exports.noop = function() {};

chnRange = exports.chnRange = '\\u2E80-\\uFE4F';
merge = exports.merge = function(a, b) {

@@ -23,2 +25,46 @@ var key, value;

/**
 * Recursively copies a value.
 *
 * - null, undefined and non-object values are returned as they are.
 * - Date instances are copied through their timestamp.
 * - RegExp instances are rebuilt from their source and flags (lastIndex is
 *   not carried over).
 * - Any other object is rebuilt by calling its constructor without arguments
 *   and cloning every property enumerated by for...in.
 *
 * Written as a function declaration so the recursive call resolves to this
 * function itself; it coexists with the file's existing `var clone`.
 *
 * @param {*} obj value to copy
 * @returns {*} the copy (or the value itself for primitives)
 */
function clone(obj) {
  var flags, key, newInstance;
  if ((obj == null) || typeof obj !== 'object') {
    return obj;
  }
  if (obj instanceof Date) {
    return new Date(obj.getTime());
  }
  if (obj instanceof RegExp) {
    if (typeof obj.flags === 'string') {
      // Engines exposing RegExp.prototype.flags: copy every flag verbatim.
      flags = obj.flags;
    } else {
      // The flag properties are booleans, so they must be tested for
      // truthiness. The previous `!= null` checks were always true and gave
      // every copy the flags "gimy" regardless of the original.
      flags = '';
      if (obj.global) {
        flags += 'g';
      }
      if (obj.ignoreCase) {
        flags += 'i';
      }
      if (obj.multiline) {
        flags += 'm';
      }
      if (obj.sticky) {
        flags += 'y';
      }
    }
    return new RegExp(obj.source, flags);
  }
  newInstance = new obj.constructor();
  for (key in obj) {
    newInstance[key] = clone(obj[key]);
  }
  return newInstance;
}
/**
 * Returns a props object carrying an extra tag.
 *
 * When propName is null/undefined the input is returned untouched (same
 * reference). Otherwise the input is copied with clone(), replaced by a
 * fresh object if the copy is null/undefined, and copy[propName] is set to 1.
 *
 * @param {Object|null|undefined} a existing props
 * @param {string|null|undefined} propName tag to add
 * @returns {Object|null|undefined} the tagged copy, or `a` itself
 */
newProps = exports.newProps = function(a, propName) {
  var tagged;
  if (propName == null) {
    return a;
  }
  tagged = clone(a);
  tagged = tagged != null ? tagged : {};
  tagged[propName] = 1;
  return tagged;
};
unique = exports.unique = function(arr) {

@@ -81,5 +127,4 @@ var key, output, value, _i, _ref, _results;

start: start + lastPos,
props: extend({}, word.props)
props: newProps(word.props, propName)
};
value.props[propName] = 1;
result.push(value);

@@ -100,3 +145,5 @@ }

value.props = extend({}, word.props);
value.props[propName] = 1;
if (propName != null) {
value.props[propName] = 1;
}
result.push(value);

@@ -112,3 +159,3 @@ }

tokenizeSync = exports.tokenizeSync = function(words, options) {
var result, word, _i, _len, _ref;
var result, word, _i, _len;
if (options == null) {

@@ -123,3 +170,2 @@ throw new Error('options required');

}
options.skipProps = unique([options.propName].concat((_ref = options.skipProps) != null ? _ref : []));
if (options.removeToken == null) {

@@ -131,3 +177,3 @@ options.removeToken = false;

word = words[_i];
if (findOne(word.props, options.skipProps)) {
if (word.props != null) {
result.push(word);

@@ -140,1 +186,75 @@ } else {

};
/**
 * Splits one word into pieces around every match of options.pattern.
 *
 * Each matched piece gets props produced by newProps(word.props,
 * options.propName). Unmatched text before, between and after matches is
 * emitted as pieces sharing word.props, unless options.removeSplitter is
 * set, in which case those pieces are dropped. When nothing matches, the
 * original word object is returned as the only element.
 *
 * NOTE(review): the loop walks the array returned by String.prototype.match,
 * so options.pattern is presumably expected to carry the `g` flag (without
 * it the array also contains capture groups) - confirm against callers.
 *
 * @param {{w: string, start: number, props: (Object|undefined)}} word
 * @param {Object} options
 * @param {RegExp} options.pattern pattern to search for
 * @param {string} [options.propName] tag set on matched pieces
 * @param {boolean} [options.removeSplitter=false] drop unmatched text
 * @param {function(string, number): number} [options.tailAt] optional
 *   callback returning the end index of a match given the remaining text
 *   and the match's start index within it
 * @returns {Array<Object>} pieces in text order
 */
findMatchSync = exports.findMatchSync = function(word, options) {
  var allMatched, index, oneMatched, removeSplitter, result, start, tail, text, value, _i, _len, _ref;
  result = [];
  allMatched = word.w.match(options.pattern);
  removeSplitter = (_ref = options.removeSplitter) != null ? _ref : false;
  if (allMatched != null) {
    // `start` is the absolute offset of `text` (the not-yet-consumed
    // remainder of word.w) in the original input.
    start = word.start;
    text = word.w;
    for (_i = 0, _len = allMatched.length; _i < _len; _i++) {
      oneMatched = allMatched[_i];
      index = text.indexOf(oneMatched);
      if (options.tailAt != null) {
        tail = options.tailAt(text, index);
      } else {
        tail = index + oneMatched.length;
      }
      if (!removeSplitter && index > 0) {
        value = {
          w: text.slice(0, index),
          start: start
        };
        if (word.props != null) {
          value.props = word.props;
        }
        result.push(value);
      }
      value = {
        w: text.slice(index, tail),
        start: start + index,
        props: newProps(word.props, options.propName)
      };
      result.push(value);
      text = text.slice(tail);
      start += tail;
    }
    // Emit whatever follows the last match. This must look at the remaining
    // text: `start` is an absolute offset, so comparing it with
    // word.w.length silently dropped the tail whenever word.start > 0.
    if (!removeSplitter && text.length > 0) {
      value = {
        w: text,
        start: start
      };
      if (word.props != null) {
        value.props = word.props;
      }
      result.push(value);
    }
  } else {
    result.push(word);
  }
  return result;
};
/**
 * Runs findMatchSync over a list of words.
 *
 * Words that already have a props object are copied to the output as they
 * are; every other word is replaced by the pieces findMatchSync returns
 * for it.
 *
 * @param {Array<Object>} words words to scan
 * @param {Object} options must provide `pattern` and `propName`
 * @returns {Array<Object>} the resulting word list
 * @throws {Error} when options, options.pattern or options.propName is
 *   null/undefined
 */
matchSync = exports.matchSync = function(words, options) {
  var collected, current, position, total;
  if (options == null) {
    throw new Error('options required');
  }
  if (options.pattern == null) {
    throw new Error('options.pattern required');
  }
  if (options.propName == null) {
    throw new Error('options.propName required');
  }
  collected = [];
  total = words.length;
  position = 0;
  while (position < total) {
    current = words[position];
    position += 1;
    if (current.props != null) {
      collected.push(current);
      continue;
    }
    collected = collected.concat(findMatchSync(current, options));
  }
  return collected;
};
{
"name": "chinese-seg",
"version": "0.0.0",
"version": "0.0.1",
"description": "Implement Chinese text segmentation algorithm",

@@ -5,0 +5,0 @@ "main": "index.js",

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Socket · Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc