chinese-seg
Advanced tools
Comparing version 0.0.0 to 0.0.1
@@ -311,1 +311,4 @@ {"word":"工信处","props":{"机构团体":1},"freq":100} | ||
{"word":"艘","props":{"数词":1},"freq":101} | ||
{"word":"微博","props":{"名词":1},"freq":10000} | ||
{"word":"微信","props":{"名词":1},"freq":10000} | ||
{"word":"工作坊","props":{"名词":1},"freq":2000} |
var __indexOf = [].indexOf || function(item) { for (var i = 0, l = this.length; i < l; i++) { if (i in this && this[i] === item) return i; } return -1; }; | ||
module.exports = function(options) { | ||
var extend, fn, hanzenkaku, isToken, name, noop, path, stopwords, tokenizeSync, unique, _ref, _stopwords; | ||
var extend, fn, hanzenkaku, isToken, latinAlphaPattern, name, noop, path, stopwords, tokenizeSync, unique, _ref, _stopwords; | ||
hanzenkaku = require('hanzenkaku').HanZenKaku; | ||
@@ -12,3 +12,8 @@ _ref = require('../utils'), noop = _ref.noop, path = _ref.path, extend = _ref.extend, unique = _ref.unique, tokenizeSync = _ref.tokenizeSync; | ||
options.skipProps = unique([options.propName].concat(options.skipProps)); | ||
_stopwords = '\u3000 ,.;+-|/\\\'":?<>[]{}=!@#$%^&*()~`' + '。,、':∶;?‘’“”〝〞ˆˇ﹕︰﹔﹖﹑·¨….¸;!´?!~—ˉ|‖"〃`@﹫¡¿﹏﹋﹌︴々﹟#﹩$﹠&﹪%*﹡﹢﹦' + '﹤‐ ̄¯―﹨ˆ˜﹍﹎+=<__-\ˇ~﹉﹊()〈〉‹›﹛﹜『』〖〗[]《》〔〕{}「」【】︵︷︿︹︽_﹁﹃︻︶︸' + '﹀︺︾ˉ﹂﹄︼+-×÷﹢﹣±/=≈≡≠∧∨∑∏∪∩∈⊙⌒⊥∥∠∽≌<>≤≥≮≯∧∨√﹙﹚[]﹛﹜∫∮∝∞⊙∏' + '┌┬┐┏┳┓╒╤╕─│├┼┤┣╋┫╞╪╡━┃└┴┘┗┻┛╘╧╛┄┆┅┇╭─╮┏━┓╔╦╗┈┊│╳│┃┃╠╬╣┉┋╰─╯┗━┛' + '╚╩╝╲╱┞┟┠┡┢┦┧┨┩┪╉╊┭┮┯┰┱┲┵┶┷┸╇╈┹┺┽┾┿╀╁╂╃╄╅╆' + '○◇□△▽☆●◆■▲▼★♠♥♦♣☼☺◘♀√☻◙♂×▁▂▃▄▅▆▇█⊙◎۞卍卐╱╲▁▏↖↗↑←↔◤◥╲╱▔▕↙↘↓→↕◣◢∷▒░℡™'; | ||
// Punctuation characters collected into `_stopwords`:
// the ASCII marks first, immediately followed by the CJK marks.
_stopwords = [
  '.?!,:;(){}[]"\'',
  '。。?!,、;:()[]〔〕【】﹃﹄﹁﹂《》〈〉…“”‘’„‚'
].join('');
stopwords = (function() { | ||
@@ -22,4 +27,5 @@ var result; | ||
})(); | ||
// Matches one Latin letter: ASCII A-Z / a-z plus the Latin-1 Supplement letters
// U+00C0-U+00D6, U+00D8-U+00F6 and U+00F8-U+00FF. The multiplication sign (U+00D7)
// and division sign (U+00F7) are excluded. The previous class `\xD8-\xF6-\xFE`
// parsed as "range D8-F6, literal '-', FE", so it accepted a hyphen and missed
// U+00F8-U+00FD.
latinAlphaPattern = /[a-zA-Z\xC0-\xD6\xD8-\xF6\xF8-\xFF]/;
isToken = function(text, index) { | ||
var halfChar, nextChar, oriChar, prevChar, result, _i, _j, _k, _ref1, _ref2, _ref3, _ref4, _ref5, _ref6, _ref7, _ref8, _ref9, _results, _results1, _results2; | ||
var halfChar, nextChar, oriChar, prevChar, result, _i, _ref1, _ref2, _ref3, _results; | ||
oriChar = text.charAt(index); | ||
@@ -32,3 +38,3 @@ halfChar = oriChar.toHalfwidth().toHalfwidthSpace(); | ||
case "'": | ||
result = (stopwords[prevChar] != null) || (stopwords[nextChar] != null); | ||
result = !(latinAlphaPattern.test(prevChar) && latinAlphaPattern.test(nextChar)); | ||
break; | ||
@@ -42,16 +48,2 @@ case ".": | ||
break; | ||
case "-": | ||
result = !(_ref4 = nextChar.charCodeAt(), __indexOf.call((function() { | ||
_results1 = []; | ||
for (var _j = _ref5 = '0'.charCodeAt(), _ref6 = '9'.charCodeAt(); _ref5 <= _ref6 ? _j <= _ref6 : _j >= _ref6; _ref5 <= _ref6 ? _j++ : _j--){ _results1.push(_j); } | ||
return _results1; | ||
}).apply(this), _ref4) >= 0); | ||
break; | ||
case "+": | ||
result = !(_ref7 = nextChar.charCodeAt(), __indexOf.call((function() { | ||
_results2 = []; | ||
for (var _k = _ref8 = '0'.charCodeAt(), _ref9 = '9'.charCodeAt(); _ref8 <= _ref9 ? _k <= _ref9 : _k >= _ref9; _ref8 <= _ref9 ? _k++ : _k--){ _results2.push(_k); } | ||
return _results2; | ||
}).apply(this), _ref7) >= 0); | ||
break; | ||
default: | ||
@@ -58,0 +50,0 @@ result = stopwords[oriChar] != null; |
var urlPattern, | ||
__indexOf = [].indexOf || function(item) { for (var i = 0, l = this.length; i < l; i++) { if (i in this && this[i] === item) return i; } return -1; }; | ||
urlPattern = /(?:http|https|ftp|sftp|git|ssh):(?:\/\/)(?:[-;:&=\+\$,\w]+@)?(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:['\!\(\)\*\-\w]*\.)*(?:['\!\(\)\*\-\w]*\.)(?:com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2})(?:\:[0-9]{1,5})?)/ig; | ||
// Builds the global, case-insensitive URL matcher from named pieces.
// Two alternatives are accepted:
//   1. protocol + optional auth + (IPv4 | localhost | host ending in a TLD) + optional port
//   2. no protocol: (IPv4 | localhost | host with at least two dot-terminated labels
//      before the TLD) + optional port
urlPattern = (function() {
  var auth, domain, finalPart1, finalPart2, host1, host2, ip, ipFirst, ipLast, ipMiddle, localhost, octet, port, protocol;
  protocol = "(?:http|https|ftp|sftp|git|ssh):(?://)";
  auth = "(?:[-;:&=\\+\\$,\\w]+@)";
  // Two- and three-digit octet forms shared by every position.
  octet = "25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}";
  // A single shared octet pattern cannot match "0", which rejects addresses such as
  // 192.168.0.1. Only the first octet must be non-zero; the others may be 0.
  ipFirst = "(?:" + octet + "|[1-9])";
  ipMiddle = "(?:" + octet + "|[1-9]|0)";
  ipLast = "(?:" + octet + "|[0-9])";
  ip = ipFirst + '\\.' + ipMiddle + '\\.' + ipMiddle + '\\.' + ipLast;
  localhost = "localhost";
  host1 = "(?:['\\!\\*\\-\\w]*\\.)+";
  host2 = "(?:['\\!\\*\\-\\w]*\\.){2,}";
  domain = "(?:com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2})";
  port = "(?:\\:[0-9]{1,5})";
  finalPart1 = protocol + auth + '?' + '(' + ip + '|' + localhost + '|' + host1 + domain + ')' + port + '?';
  finalPart2 = '(' + ip + '|' + localhost + '|' + host2 + domain + ')' + port + '?';
  return new RegExp("(" + finalPart1 + ")|(" + finalPart2 + ")", 'gi');
})();
module.exports = function(options) { | ||
var extend, findOne, findURL, fn, name, noop, path, unique, _ref, _ref1; | ||
_ref = require('../utils'), noop = _ref.noop, path = _ref.path, extend = _ref.extend, findOne = _ref.findOne, unique = _ref.unique; | ||
var chnRange, extend, fn, matchSync, name, noop, path, puncs, tailAt, tailReg, _ref; | ||
_ref = require('../utils'), noop = _ref.noop, path = _ref.path, extend = _ref.extend, matchSync = _ref.matchSync, chnRange = _ref.chnRange; | ||
options = extend({ | ||
removeToken: false, | ||
propName: 'url' | ||
}, options); | ||
options.skipProps = unique([options.propName].concat((_ref1 = options.skipProps) != null ? _ref1 : [])); | ||
findURL = function(word) { | ||
var index, matched, result, start, tail, tailAt, text, urlHost, value, _i, _len, _ref2; | ||
tailAt = function(text, start) { | ||
var char, index, prevChar, puncs, reachTail, _i, _ref2; | ||
puncs = ['.', ',', ';', '!', '<', '>', '(', ')']; | ||
reachTail = function(ch) { | ||
return /\s|[\u4e00-\u9FFF]/.test(ch); | ||
}; | ||
for (index = _i = start, _ref2 = text.length; start <= _ref2 ? _i < _ref2 : _i > _ref2; index = start <= _ref2 ? ++_i : --_i) { | ||
char = text.charAt(index); | ||
if (reachTail(char)) { | ||
break; | ||
} | ||
} | ||
if (index > 0) { | ||
prevChar = text.charAt(index - 1); | ||
if (__indexOf.call(puncs, prevChar) >= 0) { | ||
index -= 1; | ||
} | ||
} | ||
return index; | ||
tailReg = new RegExp("\\s|[" + chnRange + "]"); | ||
// Punctuation characters as an array with one entry per character:
// the ASCII marks first, followed by the CJK marks.
puncs = [
  '.?!,:;(){}[]"\'',
  '。。?!,、;:()[]〔〕【】﹃﹄﹁﹂《》〈〉…“”‘’„‚'
].join('').split('');
tailAt = function(text, start) { | ||
var char, index, prevChar, reachTail, _i, _ref1; | ||
reachTail = function(ch) { | ||
return tailReg.test(ch); | ||
}; | ||
result = []; | ||
matched = word.w.match(urlPattern); | ||
if (matched != null) { | ||
start = word.start; | ||
text = word.w; | ||
for (_i = 0, _len = matched.length; _i < _len; _i++) { | ||
urlHost = matched[_i]; | ||
index = text.indexOf(urlHost); | ||
tail = tailAt(text, index); | ||
if (index > 0) { | ||
value = { | ||
w: text.slice(0, index), | ||
start: start | ||
}; | ||
if (word.props != null) { | ||
value.props = word.props; | ||
} | ||
result.push(value); | ||
} | ||
value = { | ||
w: text.slice(index, tail), | ||
start: start + index, | ||
props: (_ref2 = word.props) != null ? _ref2 : {} | ||
}; | ||
value.props[options.propName] = 1; | ||
result.push(value); | ||
text = text.slice(tail); | ||
start += tail; | ||
for (index = _i = start, _ref1 = text.length; start <= _ref1 ? _i < _ref1 : _i > _ref1; index = start <= _ref1 ? ++_i : --_i) { | ||
char = text.charAt(index); | ||
if (reachTail(char)) { | ||
break; | ||
} | ||
if (start < word.w.length) { | ||
value = { | ||
w: text, | ||
start: start | ||
}; | ||
if (word.props != null) { | ||
value.props = word.props; | ||
} | ||
result.push(value); | ||
} | ||
if (index > 0) { | ||
prevChar = text.charAt(index - 1); | ||
if (__indexOf.call(puncs, prevChar) >= 0) { | ||
index -= 1; | ||
} | ||
} else { | ||
result.push(word); | ||
} | ||
return result; | ||
return index; | ||
}; | ||
name = path.basename(__filename, path.extname(__filename)); | ||
fn = function(words, next) { | ||
var result, word, _i, _len; | ||
if (next == null) { | ||
next = noop; | ||
} | ||
result = []; | ||
for (_i = 0, _len = words.length; _i < _len; _i++) { | ||
word = words[_i]; | ||
if (findOne(word.props, options.skipProps)) { | ||
result.push(word); | ||
} else { | ||
result = result.concat(findURL(word)); | ||
} | ||
} | ||
return next(null, result); | ||
options.pattern = urlPattern; | ||
options.tailAt = tailAt; | ||
return next(null, matchSync(words, options)); | ||
}; | ||
@@ -97,0 +61,0 @@ return { |
var __indexOf = [].indexOf || function(item) { for (var i = 0, l = this.length; i < l; i++) { if (i in this && this[i] === item) return i; } return -1; }; | ||
module.exports = function(options) { | ||
var extWesternChars, extend, fn, hanzenkaku, isToken, name, noop, path, tokenizeSync, unique, _ref; | ||
var extWesternChars, extend, findMatchSync, fn, fullToHalf, hanzenkaku, isToken, name, newProps, noop, numberIdentify, path, tokenizeSync, unique, versionIdentify, _ref; | ||
hanzenkaku = require('hanzenkaku').HanZenKaku; | ||
_ref = require('../utils'), noop = _ref.noop, path = _ref.path, extend = _ref.extend, unique = _ref.unique, tokenizeSync = _ref.tokenizeSync; | ||
_ref = require('../utils'), noop = _ref.noop, path = _ref.path, extend = _ref.extend, unique = _ref.unique, tokenizeSync = _ref.tokenizeSync, findMatchSync = _ref.findMatchSync, newProps = _ref.newProps; | ||
options = extend({ | ||
removeToken: false, | ||
propName: 'western', | ||
fullToHalf: false | ||
propName: 'western' | ||
}, options); | ||
@@ -20,2 +19,57 @@ extWesternChars = [8361]; | ||
options.isToken = isToken; | ||
// Rewrites `w` to its half-width form (in place) on every row tagged with
// `options.propName`. Untagged rows pass through untouched.
// Returns a new array holding the same row objects.
fullToHalf = function(words) {
  var isTagged = function(row) {
    return row.props != null && row.props[options.propName] != null;
  };
  return words.map(function(row) {
    if (isTagged(row)) {
      row.w = row.w.toHalfwidth().toHalfwidthSpace();
    }
    return row;
  });
};
// Splits version-like runs ("1.2.3" or "1.2.3.4") out of tagged words.
//
// words: array of word objects ({w, start, props}).
// Returns a new array. Words not tagged with `options.propName` are passed through
// unchanged; tagged words are replaced by whatever `findMatchSync` returns for them,
// with matches tagged `<propName>.version`.
//
// The tag is read from `options.propName` (as `fullToHalf` does) rather than the
// literal 'western', so a custom propName no longer disables version detection.
// With the default propName the emitted tag is still 'western.version'.
versionIdentify = function(words) {
  var baseProp, opt, result;
  baseProp = options.propName;
  opt = {
    pattern: /\d+(\.\d+){2,3}/gi,
    propName: baseProp + '.version'
  };
  result = [];
  words.forEach(function(word) {
    if (word.props == null || word.props[baseProp] == null) {
      result.push(word);
    } else {
      result.push.apply(result, findMatchSync(word, opt));
    }
  });
  return result;
};
// Splits numeric runs (integers, comma-grouped thousands, decimals, leading-dot
// decimals) out of tagged words.
//
// words: array of word objects ({w, start, props}).
// Returns a new array. A word is passed through unchanged when it is not tagged with
// `options.propName`, or is already tagged as a version or a number. Otherwise it is
// replaced by the pieces `findMatchSync` returns; matches are tagged
// `<propName>.number`, and those containing a '.' also get `<propName>.number.fraction`.
//
// The tag is read from `options.propName` (as `fullToHalf` does) rather than the
// literal 'western'. With the default propName the emitted tags are unchanged.
numberIdentify = function(words) {
  var baseProp, fractionProp, numberProp, opt, result, versionProp;
  baseProp = options.propName;
  versionProp = baseProp + '.version';
  numberProp = baseProp + '.number';
  fractionProp = numberProp + '.fraction';
  opt = {
    pattern: /((\d+([\,]\d{3})*)(\.\d+)?)|(\.\d+)/gi,
    propName: numberProp
  };
  result = [];
  words.forEach(function(word) {
    var props = word.props;
    if (props == null || props[baseProp] == null || props[versionProp] != null || props[numberProp] != null) {
      result.push(word);
      return;
    }
    findMatchSync(word, opt).forEach(function(row) {
      if (row.props != null && row.props[numberProp] != null && row.w.indexOf('.') >= 0) {
        row.props[fractionProp] = 1;
      }
      result.push(row);
    });
  });
  return result;
};
name = path.basename(__filename, path.extname(__filename)); | ||
@@ -27,14 +81,4 @@ fn = function(words, next) { | ||
} | ||
result = tokenizeSync(words, options); | ||
if (options.fullToHalf) { | ||
result = result.map(function(row) { | ||
var _ref1; | ||
if (((_ref1 = row.props) != null ? _ref1[options.propName] : void 0) != null) { | ||
row.w = row.w.toHalfwidth().toHalfwidthSpace(); | ||
return row; | ||
} else { | ||
return row; | ||
} | ||
}); | ||
} | ||
result = versionIdentify(fullToHalf(tokenizeSync(words, options))); | ||
result = numberIdentify(result); | ||
return next(null, result); | ||
@@ -41,0 +85,0 @@ }; |
@@ -16,2 +16,3 @@ var Segment, basename, exports, fs, middleware, path, proto, utils; | ||
utils.merge(this, proto); | ||
this.utils = utils; | ||
this.stack = []; | ||
@@ -18,0 +19,0 @@ } |
134
lib/utils.js
@@ -1,2 +0,2 @@ | ||
var eql, extend, findOne, merge, noop, path, splitWordSync, tokenizeSync, unique; | ||
var chnRange, clone, eql, extend, findMatchSync, findOne, matchSync, merge, newProps, noop, path, splitWordSync, tokenizeSync, unique; | ||
@@ -11,2 +11,4 @@ noop = exports.noop = function() {}; | ||
chnRange = exports.chnRange = '\\u2E80-\\uFE4F'; | ||
merge = exports.merge = function(a, b) { | ||
@@ -23,2 +25,46 @@ var key, value; | ||
// Recursively deep-copies a value.
//
// obj: any value.
// Returns:
//   - null, undefined and non-objects: returned as-is;
//   - Date: a new Date with the same timestamp;
//   - RegExp: a new RegExp with the same source and the same g/i/m/y flags;
//   - anything else: `new obj.constructor()` populated with clones of every
//     enumerable property visited by `for...in`.
clone = function(obj) {
  var flags, key, newInstance;
  if ((obj == null) || typeof obj !== 'object') {
    return obj;
  }
  if (obj instanceof Date) {
    return new Date(obj.getTime());
  }
  if (obj instanceof RegExp) {
    // The flag properties are booleans, so they must be tested for truthiness.
    // A `!= null` check is always true and would give every copy the flags "gimy".
    flags = '';
    if (obj.global) {
      flags += 'g';
    }
    if (obj.ignoreCase) {
      flags += 'i';
    }
    if (obj.multiline) {
      flags += 'm';
    }
    if (obj.sticky) {
      flags += 'y';
    }
    return new RegExp(obj.source, flags);
  }
  newInstance = new obj.constructor();
  for (key in obj) {
    newInstance[key] = clone(obj[key]);
  }
  return newInstance;
};
// Produces a props object carrying an extra tag.
//
// a: existing props object (may be null/undefined).
// propName: tag to set to 1; when null/undefined, `a` itself is returned untouched.
// Returns a clone of `a` (or a fresh object when the clone is null/undefined)
// with `propName` set to 1.
newProps = exports.newProps = function(a, propName) {
  var copy;
  if (propName != null) {
    copy = clone(a);
    copy = copy != null ? copy : {};
    copy[propName] = 1;
    return copy;
  }
  return a;
};
unique = exports.unique = function(arr) { | ||
@@ -81,5 +127,4 @@ var key, output, value, _i, _ref, _results; | ||
start: start + lastPos, | ||
props: extend({}, word.props) | ||
props: newProps(word.props, propName) | ||
}; | ||
value.props[propName] = 1; | ||
result.push(value); | ||
@@ -100,3 +145,5 @@ } | ||
value.props = extend({}, word.props); | ||
value.props[propName] = 1; | ||
if (propName != null) { | ||
value.props[propName] = 1; | ||
} | ||
result.push(value); | ||
@@ -112,3 +159,3 @@ } | ||
tokenizeSync = exports.tokenizeSync = function(words, options) { | ||
var result, word, _i, _len, _ref; | ||
var result, word, _i, _len; | ||
if (options == null) { | ||
@@ -123,3 +170,2 @@ throw new Error('options required'); | ||
} | ||
options.skipProps = unique([options.propName].concat((_ref = options.skipProps) != null ? _ref : [])); | ||
if (options.removeToken == null) { | ||
@@ -131,3 +177,3 @@ options.removeToken = false; | ||
word = words[_i]; | ||
if (findOne(word.props, options.skipProps)) { | ||
if (word.props != null) { | ||
result.push(word); | ||
@@ -140,1 +186,75 @@ } else { | ||
}; | ||
// Splits one word into pieces around every occurrence of `options.pattern`.
//
// word: {w: string, start: number, props?: object}.
// options:
//   pattern        - RegExp (normally global) run against `word.w`;
//   propName       - tag passed to `newProps` for each matched piece;
//   tailAt         - optional function(text, index) returning the end index of the
//                    matched piece inside `text`; defaults to the end of the match;
//   removeSplitter - when true, text outside the matched pieces is dropped.
// Returns an array of word objects. When nothing matches it is `[word]`. Unmatched
// pieces reuse `word.props` by reference; matched pieces get props from `newProps`.
findMatchSync = exports.findMatchSync = function(word, options) {
  var allMatched, index, oneMatched, removeSplitter, result, start, tail, text, value, _i, _len, _ref;
  result = [];
  allMatched = word.w.match(options.pattern);
  removeSplitter = (_ref = options.removeSplitter) != null ? _ref : false;
  if (allMatched == null) {
    result.push(word);
    return result;
  }
  if (!options.pattern.global) {
    // Without the g flag, match() returns [fullMatch, ...captureGroups];
    // only the first entry is an actual match.
    allMatched = [allMatched[0]];
  }
  start = word.start;
  text = word.w;
  for (_i = 0, _len = allMatched.length; _i < _len; _i++) {
    oneMatched = allMatched[_i];
    index = text.indexOf(oneMatched);
    if (index < 0) {
      // An earlier piece extended by tailAt already swallowed this match.
      continue;
    }
    if (options.tailAt != null) {
      tail = options.tailAt(text, index);
    } else {
      tail = index + oneMatched.length;
    }
    if (!removeSplitter && index > 0) {
      value = {
        w: text.slice(0, index),
        start: start
      };
      if (word.props != null) {
        value.props = word.props;
      }
      result.push(value);
    }
    value = {
      w: text.slice(index, tail),
      start: start + index,
      props: newProps(word.props, options.propName)
    };
    result.push(value);
    text = text.slice(tail);
    start += tail;
  }
  // `start` is an absolute offset (it begins at word.start), so it cannot be compared
  // with word.w.length; test the remaining text instead, otherwise the trailing piece
  // is lost whenever word.start > 0.
  if (!removeSplitter && text.length > 0) {
    value = {
      w: text,
      start: start
    };
    if (word.props != null) {
      value.props = word.props;
    }
    result.push(value);
  }
  return result;
};
// Applies `findMatchSync` to every word that has no props yet.
//
// words: array of word objects.
// options: must provide `pattern` and `propName`.
// Throws Error when `options`, `options.pattern` or `options.propName` is missing.
// Returns a new array: words that already have props are kept as-is, the others are
// replaced by the pieces `findMatchSync` returns for them.
matchSync = exports.matchSync = function(words, options) {
  var collected, i, total, word;
  if (options == null) {
    throw new Error('options required');
  }
  if (options.pattern == null) {
    throw new Error('options.pattern required');
  }
  if (options.propName == null) {
    throw new Error('options.propName required');
  }
  collected = [];
  for (i = 0, total = words.length; i < total; i++) {
    word = words[i];
    if (word.props == null) {
      collected = collected.concat(findMatchSync(word, options));
    } else {
      collected.push(word);
    }
  }
  return collected;
};
{ | ||
"name": "chinese-seg", | ||
"version": "0.0.0", | ||
"version": "0.0.1", | ||
"description": "Implement Chinese text segmentation algorithm", | ||
@@ -5,0 +5,0 @@ "main": "index.js", |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
8430767
47
1268
1